In light of religious terrorist attacks in Paris, please see Category:blasphemy!

Freedom Porn:free technology/comget

From Freedom Porn
Jump to: navigation, search

This is a shell script that downloads all the files that are proposed for deletion on Wikimedia Commons via the {{delete}} template. It can also move files out of the download directory into an 'old' subdirectory (it does not currently check whether the file was deleted or kept); this is done with the '--sort-dir' attribute.

Another possible use is to give it a text file with the names of files to download (see the '-i' attribute), or to provide a single file to download on the command line (only when no other parameters are given).

Note: This script makes it much easier to generate Category:Wikimedia preserved content

Licence

Available under the terms of GNU General Public License 3.0

Produced in the year 2013.
GPLv3
For more information please see Freedom Porn:copyrights.


Known bugs

  • Sometimes the download of the file may freeze after the connection is made, for some reason timeout is not hit there. It may be a bug in wget or serious problems with the internet.
  • If the error occurs when reading the list of files proposed for deletion and the '--sort-dir' attribute is given, then all the files will be moved to 'old', which is probably not what the user intended.
  • Terminating the programme leaves a partially downloaded file.
  • Partially downloaded file will block complete download.

Code

comget.sh
#!/bin/bash

PROGRAM_NAME="comget"
PROGRAM_VERSION="0,1b"
PROGRAM_AUTHORS="VolodyA! V Anarhist"
PROGRAM_COPYRIGHT_YEAR="2013"
PROGRAM_LICENCE="GPL 3.0"

# Layout inspired by https://commons.wikimedia.org/wiki/User:Slick/convert_non-interlaced_botscript.sh

# comget: a wrapper around wget/curl that downloads files from Wikimedia
# Commons together with their description pages.

# Invocation:
#   comget -i file                    'file' lists one page name per line
#   comget --proposed-for-deletion    fetch everything tagged {{delete}}

# Mode flags; the argument parser below may flip these to 1.
PROPOSED=0
SORT=0

printf 'Beginning to work '; date
 
# ---------------------------------------------------------------------------
# Argument parsing.
# The loop walks every argument via "$@" (expanded once, so the shift at the
# bottom does not disturb the iteration) while keeping "$1" in step with
# COMMAND, so value-taking options can read their parameter from "$1".
# Recognised options:
#   -h / --help                    print usage and exit
#   -i / --input FILE              read the list of page names from FILE
#   -d / --proposed-for-deletion   scrape the deletion queue from Commons
#   --sort-dir                     move no-longer-listed files into ./old
# Anything else is treated as a page name to download.
# ---------------------------------------------------------------------------
for COMMAND in "$@"; do
	# Quoted expansion: an empty argument no longer crashes the test.
	if [ "${COMMAND:0:1}" = "-" ]; then
		if [ "${COMMAND}" = "-h" ] || [ "${COMMAND}" = "--help" ]; then
			echo "${PROGRAM_NAME} ${PROGRAM_VERSION}."
			echo " Copyright ${PROGRAM_COPYRIGHT_YEAR} ${PROGRAM_AUTHORS}."
			# echo without -e would print a literal "\n"; emit a real
			# blank line instead.
			echo " Released under ${PROGRAM_LICENCE}."
			echo ""

			echo "The command line must take the form of:"
			echo " ${0} --input file.lst"
			echo "  or"
			echo " ${0} --proposed-for-deletion"

			exit 0
		elif [ "${COMMAND}" = "-i" ] || [ "${COMMAND}" = "--input" ]; then
			# The file name arrives as the NEXT argument (handled below).
			STATE="--input"
		elif [ "${COMMAND}" = "-d" ] || [ "${COMMAND}" = "--proposed-for-deletion" ]; then
			PROPOSED=1
			# Scrape Special:WhatLinksHere/Template:Delete (File namespace
			# only), cut out the result list, extract the File: titles,
			# URL-decode them and normalise underscores to spaces.
			# http://ruslanspivak.com/2010/06/02/urlencode-and-urldecode-from-a-command-line/
			LIST=$(curl 'https://commons.wikimedia.org/w/index.php?title=Special:WhatLinksHere/Template:Delete&namespace=6&hideredirs=1&limit=5000'|tr '[\n\r]' '\0'|sed 's/.*<ul id=\"mw-whatlinkshere-list\">//'|sed 's/<\/ul>View .*//'| tr '\0' '\n'|sed 's/.*href=\"\/wiki\/File://' | sed 's/\".*//' |python -c 'import sys, urllib as ul; print ul.unquote_plus(sys.stdin.read())' | sed 's/_/ /g' | sort)
		elif [ "${COMMAND}" = "--sort-dir" ]; then
			SORT=1
		fi
	elif [ "${STATE}" = "--input" ]; then
		if [ ! -f "$1" ] || [ ! -r "$1" ]; then
			# Diagnostics belong on stderr.
			echo "File '$1' doesn't exist or is not readable." >&2
			exit 2
		fi
		# Quoted so file names containing spaces work.
		LIST=$(cat "$1")
		STATE=""
	else
		# Bare argument: append it as one more page name.  The names are
		# newline-separated so several names given on the command line no
		# longer run together into a single line.
		LIST="${LIST}${LIST:+$'\n'}${1}"
	fi
	shift
done
 
# The last option on the command line was still waiting for its value
# (e.g. "-i" given with no file name after it).
if [ "$STATE" != "" ]; then
	echo "There was no parameter for '$STATE'"
	exit 2
elif [ "$LIST" != "" ]; then
	# Main download loop: LIST holds one page name per line.
	while read LINE
	do
		if [ "$LINE" = "" ]; then
			continue
		else
			# count every non-empty request line
			let REQ_COUNT++
		fi
		# Derive FILE (name on disk) and TITLE (wiki page title with a
		# namespace prefix) from the raw line.
		if [ "${LINE:0:5}" != "File:" ] && [ "${LINE:0:6}" != "Image:" ]; then
			# no namespace prefix given — assume the File: namespace
			FILE="${LINE}"
			TITLE="File:${LINE}"
		elif [ "${LINE:0:6}" != "Image:" ]; then
			# "File:" prefix — strip its 5 characters for the disk name
			FILE="${LINE:5}"
			TITLE="${LINE}"
		else
			# legacy "Image:" prefix — strip its 6 characters
			FILE="${LINE:6}"
			TITLE="${LINE}"
		fi

		# Skip files that are already present.  NOTE(review): a partially
		# downloaded file also matches here and blocks a complete
		# re-download — see "Known bugs" above.
		if [ -e "${FILE}" ]; then
			#echo "> file already exists ${FILE}"
			let IDENT_COUNT++
			continue
		fi

		echo "> process ${FILE} ... "

		# Query the MediaWiki API (prop=imageinfo) for this title, flatten
		# the XML reply to one line, then pull out the first url=...
		# attribute value.
		XML="`curl --silent --max-time 5 --retry 50 -G --data \"action=query\" --data \"prop=imageinfo\" --data \"format=xml\" --data \"iiprop=url|size\" --data-urlencode \"titles=${TITLE}\" https://commons.wikimedia.org/w/api.php  | tr '[\r\n]' ' '`"
		URL="`echo \"${XML}\" | sed 's/url=/\nurl=/g'| grep "^url=" | head -n 1|sed '/url/s/\(.*url=\)\(.*\)/\2/' |awk -F\\" '{print $2}'`"

		if [ "${URL}" = '' ]; then
			# No download URL in the reply — typically a redirect or a
			# page that is not an actual file.
			echo "- Empty url (maybe a redirect)"
			let ERROR_COUNT++
			continue
		fi

		echo "> downloading ${URL} ... "

		# Fetch the media file itself; on success also save the raw
		# wikitext of its description page next to it as FILE.text.
		if wget --connect-timeout=12 --retry-connrefused --read-timeout=20 -t 5 --waitretry=2 -O "${FILE}" "${URL}"; then
			echo "> get current information ... "
			ERR=0
			# Retry the description download up to 10 times before
			# giving up.
			until curl --silent --max-time 10 -G --data-urlencode "title=${TITLE}" --data "action=raw" "https://commons.wikimedia.org/w/index.php" > "${FILE}.text"; do
				let ERR++
				echo "- Fail ${ERR} (information)"
				if [ ${ERR} = 10 ]; then
					echo "- breaking from loop"
					ERR='final'
					break
				fi
			done
			if [ "${ERR}" = 'final' ]; then
				# Description never arrived: log the failure and leave
				# an empty FILE.err marker beside the media file.
				echo "- Fail final (information)"
				echo "'${FILE}' ${URL}" >> error.log.text
				echo "" > "${FILE}.err"
			else
				let DL_COUNT++
				echo "+ Success"
			fi
		else
			# wget failed: record the error and remove the partial file
			# so the next run can retry it.
			let ERROR_COUNT++
			echo "'${FILE}' ${URL}" >> error.log.text
			echo "- Fail (download)"
			rm -f "${FILE}"
		fi

	done <<< "${LIST}"
fi
 
# ---------------------------------------------------------------------------
# --sort-dir: move files that are no longer in the deletion list into ./old.
# Note (known bug): if reading the deletion list failed, LIST is empty and
# everything gets moved.
# ---------------------------------------------------------------------------
if [ "${SORT}" = 1 ]; then		# quoted: no test error when SORT is unset
	# Regular files currently in the download directory, excluding the
	# *.text description files, sorted to match LIST's ordering.
	LSALL=$(find . -mindepth 1 -maxdepth 1 -type f -not -name '*.text' -printf '%f\n'|sort)
	# Lines only present on disk ("> " side of the diff) are stale.
	LSOLD=$(diff <(printf "%s" "${LIST}") <(printf "%s" "${LSALL}")|grep '^> '|sed 's/^..//g')
	mkdir -p old			# mv -t old fails outright if the directory is missing
	while IFS= read -r FILE; do	# -r/IFS=: keep backslashes and leading spaces intact
		if [ "${FILE}" = '' ]; then continue; fi
		mv -t old "./${FILE}"
		# The .text companion may be absent (e.g. its download failed);
		# only move it when it exists instead of spewing an mv error.
		if [ -e "./${FILE}.text" ]; then
			mv -t old "./${FILE}.text"
		fi
	done <<< "${LSOLD}"
fi
 
# ---------------------------------------------------------------------------
# Final report.  The counters are only created by "let VAR++" when the
# matching event happens, so default them to 0 instead of printing blanks.
# ---------------------------------------------------------------------------
echo    ""
echo    "================================="
echo    "Files in request:         ${REQ_COUNT:-0}"
echo    "Files already here:       ${IDENT_COUNT:-0}"
echo    "Files downloaded:         ${DL_COUNT:-0}"
echo -n "Files moved to old:       "; printf "%s" "${LSOLD}"|grep -c '^'
echo    "Errors while downloading: ${ERROR_COUNT:-0}"
# Count regular files only, with the same filter used to build LSALL
# (don't parse ls; don't count the 'old' subdirectory as a file).
echo -n "Files in directory:       "; find . -mindepth 1 -maxdepth 1 -type f -not -name '*.text' | grep -c '^'
echo    ""
echo -n "Finished work "; date