Here is the script I use to make a backup copy of the wiki's content.
#!/bin/bash
# Copyright 2008 Joe Wells.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You can find the GNU General Public License at
# <URL:http://www.gnu.org/licenses/>.
# Limitations of this backup script:
#
# 1. Only the latest versions of wiki pages and uploaded files are
# backed up. Older versions and edit comments are not backed up.
#
# 2. The wiki settings are not backed up (although the Theme CSS and
# image files are backed up).
#
# 3. Per-page security settings are not backed up.
#
# 4. All wiki pages and uploaded files are backed up, including things
# that have not changed recently. This could be resource
# intensive, especially for uploaded files.
#
# 5. The timestamps of all backed up files are always set to the time
# the backup was done, rather than when the data was last modified
# on the wiki.
#
# 6. Pages and per-page uploaded files that have been deleted on the
# wiki are not removed from a pre-existing local backup.
#
# 7. This script must be run by the administrator because it aborts if
# it does not have permission to read all wiki pages.
#
# 8. This script assumes the “All Pages” page contains just a
# [[TitleIndex]] directive. This script also assumes unguaranteed
# details about the HTML generated for this directive, and also the
# HTML generated for the per-page files list. This script also
# assumes unguaranteed details about the URLs used for accessing
# the per-page files list and the raw page contents. If any of
# these details change, this script will silently fail to do proper
# backups.
#
# Advantages of JabberWokky's spotget script over this one (and ideas
# for improvements of this one):
#
# 1. spotget names the backup directory after the wiki.
#
# 2. spotget has an option to put the backup in a .tar.gz file with the
# date and wiki name in the archive's file name.
#
# 3. spotget allows (in fact, requires) giving only the Wiki Spot
# wiki name (instead of the URL of a Sycamore-based wiki).
#
# 4. spotget will simply get all accessible content (instead of
# aborting if some content is not accessible).
#
# 5. spotget has some help text.
#
# 6. spotget stores the tinylogo.png (defaulting to Wiki Spot's) as
# the .favicon.png file for the backup directory and makes a
# .directory file (this affects the visual display of some file
# browsers presumably).
#
# 7. spotget tests for errors in making the backup directory.
#
# 8. spotget can be edited to configure it to be quiet.
#
# Differences with spotget:
#
# 1. spotget uses a different method of extracting the list of pages
# from the HTML for the “All Pages” page, and also for extracting
# the list of subfiles attached to each page. spotget's method
# might (or might not) be more robust.
#
# 2. spotget uses the urlencode program (what package supplies this
# program?) in a way that might make it more robust in the handling
# of strange page names. (I think I handle strange page names
# correctly, although differently.)
#
# 3. spotget's handling of slashes in page names is probably a bit
# simpler (it uses urlencode).
#
# Advantages of this script over spotget:
#
# 1. Support for relaying and proxying. (You are unlikely to need
# this, but if you do it's there.)
#
# 2. Support for using a browser cookie file so that a logged-in
# administrator can back up all pages.
#
# 3. Can work for any Sycamore-based wiki, not just those at Wiki
# Spot.
# Default settings; the command-line options parsed below may override them.
# Diagnostic output is off unless --debug / --trace are given.
Tracing=f
Debugging=f
# Default cookie file: whatever the Firefox default-profile glob expands to.
# (echo joins several matches with spaces; an unmatched glob stays literal.)
CookieFile="$(echo ~/.mozilla/firefox/*.default/cookies.txt)"
# Abort unless the option named $1 is followed by a value.
# $2 is the number of arguments not yet consumed, counting the option itself.
require_option_value () {
    if [ "$2" -lt 2 ]; then
        echo "option $1 requires a value, aborting!" 1>&2
        exit 1
    fi
}

# Parse the command-line options passed as arguments and store the results
# in the global settings variables:
#   --host VALUE         -> WikiHost
#   --proxy VALUE        -> WikiProxyHost
#   --port VALUE         -> WikiPort
#   --relay VALUE        -> RelayHost
#   --cookie-file VALUE  -> CookieFile
#   --debug              -> Debugging=t
#   --trace              -> Tracing=t
# Anything else, or a value-taking option with no value after it, prints a
# message on stderr and exits with status 1.  (Previously a trailing option
# without its value was silently accepted as an empty setting.)
parse_args () {
    while [ $# -ne 0 ]; do
        case "$1" in
            --host)        require_option_value "$1" $#; WikiHost="$2"; shift;;
            --proxy)       require_option_value "$1" $#; WikiProxyHost="$2"; shift;;
            --port)        require_option_value "$1" $#; WikiPort="$2"; shift;;
            --relay)       require_option_value "$1" $#; RelayHost="$2"; shift;;
            --cookie-file) require_option_value "$1" $#; CookieFile="$2"; shift;;
            --debug)       Debugging=t;;
            --trace)       Tracing=t;;
            *)  echo illegal arguments, aborting! 1>&2
                exit 1;;
        esac
        shift
    done
}

parse_args "$@"
# The wiki host is mandatory; refuse to continue without one.
if [ -z "$WikiHost" ]; then
    echo must specify wiki host! aborting! 1>&2
    exit 1
fi
# With no proxy host given, connect to the wiki host itself.
: "${WikiProxyHost:=$WikiHost}"
# With no port given, use port 80.
: "${WikiPort:=80}"
# Print all arguments, joined into one line, on stderr -- but only when
# Debugging is "t"; otherwise do nothing.
debug () {
    if [[ "$Debugging" == t ]]; then
        echo "$*" 1>&2
    fi
}
# Print a banner line containing the arguments on stderr -- but only when
# Tracing is "t"; otherwise do nothing.
trace () {
    if [[ "$Tracing" == t ]]; then
        echo "== $* ======================================================================" 1>&2
    fi
}
debug "CookieFile: [$CookieFile]"
# Name of the login cookie.  “wikispot_2Eorg_2CID” means “wikispot.org,ID”.
CookieName=wikispot_2Eorg_2CID
# Look the cookie value up automatically: scan the cookie file for the
# tab-separated .wikispot.org line carrying that cookie name and print the
# text found between the double quotes of its value field.
CookieValue="$(perl -n -e "m,^\.wikispot\.org\tTRUE\t/\tFALSE\t\d+\t$CookieName\t\"([^\"\"]+)\"\$, && print \"\$1\n\"" "$CookieFile")"
debug "CookieValue: [$CookieValue]"
# Without a cookie value there is nothing to send to the wiki; give up.
if [ -z "$CookieValue" ]; then
    echo 'could not find cookie! perhaps you are not logged in to wikispot? aborting!' 1>&2
    exit 1
fi
# I need to unset this because of a personal configuration issue. (I
# set it to a private locale which does not exist on the relay host.
# This results in lots of warning messages and prevents my LC_CTYPE
# setting from working. Unsetting this variable silences these
# warning messages and re-enables LC_CTYPE.)
unset LC_TIME
# Remove the scratch file "tempfile" (written in the current directory by
# the backup loop) whenever the script exits, whatever the reason.
trap 'rm -f tempfile' EXIT
# On an interrupting signal, exit with the conventional 128+signal status;
# the EXIT trap then does the cleanup.  (A single shared trap ending in a
# bare "exit" would report rm's status instead, so an interrupted backup
# would look like a successful one.)
trap 'exit 130' SIGINT
trap 'exit 131' SIGQUIT
trap 'exit 143' SIGTERM
# fetch_url PATH
#
# Fetch http://$WikiProxyHost:$WikiPort/PATH with wget and write the
# response body to stdout.  The request carries a "Host: $WikiHost" header
# and the login cookie ($CookieName=$CookieValue).  With RelayHost empty
# wget runs locally; otherwise the same wget command is run on RelayHost
# through ssh.  Stdin of the fetching command is /dev/null.  The return
# status is that of wget or ssh.
#
# Exits with status 1 if the cookie name, cookie value or PATH contains an
# apostrophe, since the relay command line is built inside single quotes.
fetch_url () {
    local target="http://$WikiProxyHost:$WikiPort/$1"

    if [[ "$CookieName$CookieValue$1" == *\'* ]]; then
        echo unhandled case, aborting! 1>&2
        exit 1
    fi
    #debug "fetching URL: $1"
    #
    # The proxy host and relay host mechanisms are used because of the
    # stupid way I hide my wiki accesses from sitemeter; use whichever
    # one works better for you. (Further explanation: I don't hide my
    # local host because sitemeter won't hide an individual IP on free
    # accounts and there is too high a chance non-admin users will
    # share the first 24 bits of their IP with me (because we all live
    # in the same block). So I relay all my accesses through a remote
    # host.)
    #
    # I don't know why, but the wikispot software blocks access with
    # wget to the "?action=raw" URLs. So I pretend to be Mozilla.
    #
    if [[ -z "$RelayHost" ]]; then
        # Direct fetch from this machine.
        local -a direct_args=(--quiet --user-agent=Mozilla/5.0 --header="Host: $WikiHost" --header "Cookie: $CookieName=$CookieValue" --output-document=- "$target")
        debug Command: wget "${direct_args[@]}"
        wget "${direct_args[@]}" < /dev/null
    else
        # Relayed fetch: hand the whole wget command line to the relay host.
        local remote_cmd="wget --quiet --user-agent=Mozilla/5.0 --header='Host: $WikiHost' --header 'Cookie: $CookieName=$CookieValue' --output-document=- '$target'"
        debug Command: ssh "$RelayHost" "$remote_cmd"
        ssh "$RelayHost" "$remote_cmd" < /dev/null
    fi
}
# Read the HTML of the “All Pages” page on stdin and print, one per line,
# the page paths it links to.  Only hrefs that do not start with "#" or "/"
# and contain no ":" before their first "/" are kept, and at most one (the
# last matching) link per input line is reported.
extract_page_links () {
    perl -n -e 's,^.*a href="([^#/\":][^/\":]*(?:/[^\"]*)?)".*,$1, && print;'
}

# extract_subfile_names PAGE
#
# Read the HTML of PAGE's “?action=Files” listing on stdin and print, one
# per line, the names of the files attached to PAGE.  PAGE is handed to
# perl through the environment and matched literally (quotemeta), so page
# names containing regex metacharacters, commas, "$" or "@" are handled.
# A link whose target is not the link text with spaces written as %20 is
# reported on stderr and skipped.
extract_subfile_names () {
    WikiPagePath="$1" perl -n -0777 -e '
        $p = quotemeta($ENV{WikiPagePath});
        while (m,<a href="/${p}\?action=Files&do=view&target=([^\"]+)">([^<]+)</a>,g) {
            ($x,$y) = ($1,$2); $z = $y; $z =~ s/ /%20/g;
            if ($x eq $z) { print "$y\n"; } else { print STDERR "can not happen (I hope)\n"; }
        }'
}

trace just before getting page list
# Debugging aid: insert "cat; exit" or a "grep Front_Page |" stage into
# this pipeline to inspect or narrow the page list.
fetch_url "All_Pages" |
extract_page_links |
while IFS= read -r f; do
    # IFS= and -r keep backslashes and edge whitespace in page names intact.
    trace beginning of loop body for page list
    echo backing up page "$f"
    # I have to store the backup in a file with a ".f" extension
    # because there are cases where both XYZ and XYZ/ABC are valid
    # wiki pages.
    s="backup.d/$f"
    t="$s.f"
    d="$(dirname "$t")"
    debug "f: [$f], s: [$s], t: [$t], d: [$d]"
    mkdir -p "$d"
    trace just before fetching raw wiki text
    # Fetch into a scratch file first so that an error page never
    # overwrites an existing backup of this page.
    fetch_url "$f?action=raw" > tempfile
    if head -n 1 tempfile | grep --quiet 'DOCTYPE HTML PUBLIC'; then
        # Raw wiki text was requested but HTML came back.
        echo probable permissions error, backup failed, aborting! 1>&2
        exit 1
    fi
    cp tempfile "$t"
    trace just before fetching subfile web page
    fetch_url "$f?action=Files" |
    extract_subfile_names "$f" |
    while IFS= read -r g; do
        echo backing up page "$f" subfile "$g"
        debug "g: [$g]"
        mkdir -p "$s.s"
        fetch_url "$f?sendfile=true&file=$g" > "$s.s/$g"
    done
done
# The loop above is the last stage of a pipeline and so runs in a subshell:
# its "exit 1" only leaves that subshell.  Pick the status up here so that
# an aborted backup makes the whole script fail.
BackupStatus=$?
trace end of program
if [ "$BackupStatus" -ne 0 ]; then
    exit "$BackupStatus"
fi

