#!/bin/bash
# wget-mirror.sh
# Copyright 2013 Andrew Lewman, 2017 Laxdaela Technology LLC
# Originally written for [redacted] national agency for darknet market mirroring in 2013. Updated for other purposes over the years.
# Save your login cookies, feed them to wget, and you too can have 500TB of data in a matter of years.
# License: http://code.lewman.com/andrew/random-bash-scripts/src/master/LICENSE.md
#
# General error trap and output
  # Paths of temporary files to delete when the script exits. Nothing in the
  # visible part of the script appends to it; it starts out empty.
  tempfiles=()

  #######################################
  # Delete every file recorded in tempfiles.
  # Globals:   tempfiles (read)
  # Arguments: none
  # Returns:   0 when nothing is registered, otherwise the status of rm
  #######################################
  cleanup() {
    # Nothing registered: skip rm rather than run it with no operands.
    if (( ${#tempfiles[@]} == 0 )); then
      return 0
    fi
    # "--" ends option parsing so a name starting with "-" is treated as a
    # file to delete instead of an (invalid) rm option.
    rm -f -- "${tempfiles[@]}"
  }

  # Run cleanup when the shell exits (EXIT is the named form of signal 0).
  trap cleanup EXIT
  #######################################
  # Report a failure and terminate the script.
  # Arguments:
  #   $1 - line number on or near which the failure happened
  #   $2 - optional message describing the failure
  #   $3 - optional exit status (defaults to 1)
  # Outputs:   one diagnostic line on stderr
  # Returns:   never returns; exits with the given status
  #######################################
  error() {
    local parent_lineno="$1"
    local message="$2"
    local code="${3:-1}"
    # Diagnostics go to stderr so they never mix with regular output.
    if [[ -n "$message" ]]; then
      printf '%s\n' "Error on or near line ${parent_lineno}: ${message}; exiting with status ${code}" >&2
    else
      printf '%s\n' "Error on or near line ${parent_lineno}; exiting with status ${code}" >&2
    fi
    exit "${code}"
  }

  # Call error with the current line number whenever the ERR trap fires.
  # Only the line number is passed, so this path always exits with status 1.
  trap 'error "${LINENO}"' ERR
  # Require exactly one argument: the URL of the site to mirror.
  if [[ $# -ne 1 ]]; then
    # Usage errors are diagnostics, so they go to stderr.
    echo "You need to specify a site to mirror. Specify http://site.example NOT site.example." >&2
    exit 1
  fi

  SITE="$1"
  DATE="$(date "+%Y-%m-%d")"

  # Derive the host part of SITE; it is only used to name the WARC file.
  # A leading "scheme://" is dropped when present, then everything from the
  # first "/" or ":" onward (path or port) is cut off. For http:// and
  # https:// URLs this gives the same result as splitting on "/" and ":" and
  # taking the fourth field, and it also keeps "/" and ":" out of the file
  # name when the argument has no scheme.
  remainder="${SITE}"
  if [[ "${SITE}" =~ ^[A-Za-z][A-Za-z0-9+.-]*:// ]]; then
    remainder="${SITE#*://}"
  fi
  DOMAIN="${remainder%%[/:]*}"
  if [[ -z "${DOMAIN}" ]]; then
    echo "Could not determine a host name from '${SITE}'." >&2
    exit 1
  fi

  USERAGENT="Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0" # torbrowser UA

  # Options are kept in an array so each one stays a separate, safely quoted
  # word and the list can be read one option per line.
  wget_args=(
    # Send traffic through the local proxy on 127.0.0.1:8118. https_proxy is
    # set as well: http_proxy alone does not cover https:// URLs, which would
    # otherwise be fetched without the proxy.
    -e use_proxy=yes
    -e http_proxy=127.0.0.1:8118
    -e https_proxy=127.0.0.1:8118
    -e robots=off
    --no-dns-cache
    --no-check-certificate
    --continue
    --mirror
    --recursive
    --no-parent
    --convert-links
    --adjust-extension
    --page-requisites
    --random-wait
    --tries 5
    --waitretry 5
    --timeout 90
    # Also record the crawl as a WARC archive named <domain>-<date>.
    --warc-header "operator: Laxdaela Technology Darknet Archive Team"
    --warc-cdx
    --warc-file="${DOMAIN}-${DATE}"
    -U "${USERAGENT}"
  )

  /usr/bin/wget "${wget_args[@]}" "${SITE}"