Backup Script

Edit Info Talk
Search:    

Here is the script I use to make a backup copy of the wiki's content.

#!/bin/bash

# Copyright 2008 Joe Wells.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You can find the GNU General Public License at
# <URL:http://www.gnu.org/licenses/>.

# Limitations of this backup script:
#
# 1. Only the latest versions of wiki pages and uploaded files are
#    backed up.  Older versions and edit comments are not backed up.
#
# 2. The wiki settings are not backed up (although the Theme CSS and
#    image files are backed up).
#
# 3. Per-page security settings are not backed up.
#
# 4. All wiki pages and uploaded files are backed up, including things
#    that have not changed recently.  This could be resource
#    intensive, especially for uploaded files.
#
# 5. The timestamps of all backed up files are always set to the time
#    the backup was done, rather than when the data was last modified
#    on the wiki.
#
# 6. Pages and per-page uploaded files that have been deleted on the
#    wiki are not removed from a pre-existing local backup.
#
# 7. This script must be run by the administrator because it aborts if
#    it does not have permission to read all wiki pages.
#
# 8. This script assumes the “All Pages” page contains just a
#    [[TitleIndex]] directive.  This script also assumes unguaranteed
#    details about the HTML generated for this directive, and also the
#    HTML generated for the per-page files list.  This script also
#    assumes unguaranteed details about the URLs used for accessing
#    the per-page files list and the raw page contents.  If any of
#    these details change, this script will silently fail to do proper
#    backups.
#
# Advantages of JabberWokky's spotget script over this one (and ideas
# for improvements of this one):
#
# 1. spotget names the backup directory after the wiki.
#
# 2. spotget has an option to put the backup in a .tar.gz file, with
#    the date and the wiki name in the archive's file name.
#
# 3. spotget allows (in fact, requires) giving only the Wiki Spot
#    wiki name (instead of the URL of a Sycamore-based wiki).
#
# 4. spotget will simply get all accessible content (instead of
#    aborting if some content is not accessible).
#
# 5. spotget has some help text.
#
# 6. spotget stores the tinylogo.png (defaulting to Wiki Spot's) as
#    the .favicon.png file for the backup directory and makes a
#    .directory file (this affects the visual display of some file
#    browsers presumably).
#
# 7. spotget tests for errors in making the backup directory.
#
# 8. spotget can be edited to configure it to be quiet.
#
# Differences with spotget:
#
# 1. spotget uses a different method of extracting the list of pages
#    from the HTML for the “All Pages” page, and also for extracting
#    the list of subfiles attached to each page.  spotget's method
#    might (or might not) be more robust.
#
# 2. spotget uses the urlencode program (what package supplies this
#    program?) in a way that might make it more robust in the handling
#    of strange page names.  (I think I handle strange page names
#    correctly, although differently.)
#
# 3. spotget's handling of slashes in page names is probably a bit
#    simpler (it uses urlencode).
#
# Advantages of this script over spotget:
#
# 1. Support for relaying and proxying.  (You are unlikely to need
#    this, but if you do it's there.)
#
# 2. Support for using a browser cookie file so that a logged-in
#    administrator can back up all pages.
#
# 3. Can work for any Sycamore-based wiki, not just those at Wiki
#    Spot.

# Built-in defaults.  Tracing and debugging output are off ("f") until
# the corresponding command-line switch turns them on ("t").  The
# cookie file defaults to whatever the Firefox profile glob expands to
# (the unexpanded pattern itself if nothing matches).
Tracing=f
Debugging=f
CookieFile="$(echo ~/.mozilla/firefox/*.default/cookies.txt)"

# parse_args ARG...
#
# Interpret the command-line arguments and store the results in the
# global configuration variables:
#
#   --host HOST         -> WikiHost
#   --proxy HOST        -> WikiProxyHost
#   --port PORT         -> WikiPort
#   --relay HOST        -> RelayHost
#   --cookie-file FILE  -> CookieFile
#   --debug             -> Debugging=t
#   --trace             -> Tracing=t
#
# Any other argument, or a value-taking option that is the last
# argument (so its value is missing), prints a message on standard
# error and exits with status 1.
parse_args () {
    while [ $# -ne 0 ]; do
        case "$1" in
        --host|--proxy|--port|--relay|--cookie-file)
            # Without this check a trailing option would silently be
            # given an empty value.
            if [ $# -lt 2 ]; then
                echo "option $1 requires a value, aborting!" 1>&2
                exit 1
            fi
            case "$1" in
            --host)        WikiHost="$2";;
            --proxy)       WikiProxyHost="$2";;
            --port)        WikiPort="$2";;
            --relay)       RelayHost="$2";;
            --cookie-file) CookieFile="$2";;
            esac
            # Drop the option name here; the value is dropped by the
            # common shift below.
            shift;;
        --debug) Debugging=t;;
        --trace) Tracing=t;;
        *)       echo illegal arguments, aborting! 1>&2
                 exit 1;;
        esac
        shift
    done
}

parse_args "$@"

# The wiki host is the one setting that has no fallback.
if [ -z "$WikiHost" ]; then
    echo must specify wiki host!  aborting!  1>&2
    exit 1
fi

# Unless a proxy host was given, connect to the wiki host itself.
: "${WikiProxyHost:=$WikiHost}"

# Unless a port was given, use port 80.
: "${WikiPort:=80}"

# debug WORD...
#
# When debugging is enabled (Debugging=t), write the arguments, joined
# by single spaces, to standard error.  Otherwise do nothing and
# succeed.
debug () {
    if [ "$Debugging" = t ]; then
        echo "$*" 1>&2
    fi
}

# trace WORD...
#
# When tracing is enabled (Tracing=t), write a banner line containing
# the arguments to standard error, so that the phases of a run are
# easy to spot in the output.  Otherwise do nothing and succeed.
trace () {
    [ "$Tracing" = t ] || return 0
    echo "== $* ======================================================================" 1>&2
}

debug "CookieFile: [$CookieFile]"

# Name of the login cookie.  The punctuation is escaped in the name:
# “wikispot_2Eorg_2CID” means “wikispot.org,ID”.
CookieName=wikispot_2Eorg_2CID

# Look the cookie's value up in the cookie file.  The line we want is
# tab-separated: domain ".wikispot.org", TRUE, path "/", FALSE, a
# number, the cookie name, and finally the value enclosed in double
# quotes.  Only the text between the quotes is kept.  The Perl program
# is single-quoted so that the shell leaves it alone; the quotes are
# interrupted just long enough to splice in the cookie name.
CookieValue="$(perl -n -e 'm,^\.wikispot\.org\tTRUE\t/\tFALSE\t\d+\t'"$CookieName"'\t"([^""]+)"$, && print "$1\n"' "$CookieFile")"
debug "CookieValue: [$CookieValue]"

# Without the cookie the wiki would treat us as an anonymous visitor.
if [ -z "$CookieValue" ]; then
    echo 'could not find cookie!  perhaps you are not logged in to wikispot?  aborting!' 1>&2
    exit 1
fi

# Personal configuration workaround: my LC_TIME names a private locale
# that does not exist on the relay host.  Leaving it set produces lots
# of warning messages and stops my LC_CTYPE setting from working, so
# it is removed here, which silences the warnings and re-enables
# LC_CTYPE.
unset LC_TIME

# Remove the scratch file "tempfile" whenever the script ends, whether
# normally or because of an interrupt, quit or terminate signal.  The
# explicit "exit" makes sure a caught signal still ends the script.
trap 'rm -f tempfile; exit' EXIT INT QUIT TERM

# fetch_url PATH
#
# Download http://$WikiProxyHost:$WikiPort/PATH and write the response
# body to standard output.  The request carries a "Host: $WikiHost"
# header and the login cookie.  Standard input is redirected from
# /dev/null so the download can never swallow input meant for an
# enclosing "while read" loop.
#
# Globals read: CookieName, CookieValue, WikiHost, WikiProxyHost,
#               WikiPort, RelayHost.
# Returns:      the exit status of wget (or of ssh when relaying).
# Exits:        with status 1 if the cookie name, the cookie value or
#               PATH contains a single quote.
fetch_url () {
    # The relay command line below wraps these values in single
    # quotes, so a single quote inside any of them is not supported.
    if [[ "$CookieName$CookieValue$1" == *\'* ]]; then
        echo unhandled case, aborting! 1>&2
        exit 1
    fi
    #debug "fetching URL: $1"

    local cookie_header="Cookie: $CookieName=$CookieValue"
    local url="http://$WikiProxyHost:$WikiPort/$1"

    # Why there is a proxy host and a relay host at all: it is the
    # (stupid) way I hide my own wiki accesses from sitemeter; use
    # whichever one works better for you.  Sitemeter will not hide an
    # individual IP on free accounts, and hiding my whole block would
    # too likely hide non-admin users who share the first 24 bits of
    # their IP with me, so all my accesses go through a remote host.
    #
    # Why the user agent is faked: for reasons unknown the wikispot
    # software blocks wget from the "?action=raw" URLs, so we claim to
    # be Mozilla.
    if [ -z "$RelayHost" ]; then
        # Direct download from this machine.
        local -a wget_args=(
            --quiet
            --user-agent=Mozilla/5.0
            --header="Host: $WikiHost"
            --header "$cookie_header"
            --output-document=-
            "$url"
        )
        debug Command: wget "${wget_args[@]}"
        wget "${wget_args[@]}" < /dev/null
    else
        # Run the same wget on the relay host through ssh.
        local remote_command="wget --quiet --user-agent=Mozilla/5.0 --header='Host: $WikiHost' --header '$cookie_header' --output-document=- '$url'"
        debug Command: ssh "$RelayHost" "$remote_command"
        ssh "$RelayHost" "$remote_command" < /dev/null
    fi
}

# extract_page_names
#
# Filter: reads the HTML of the “All Pages” page on standard input and
# prints one page name per line.  For every input line the last
# a href="..." whose target does not start with '#', '/', ':' or '"'
# and has no ':' or '"' in its first path component is taken as a page
# link; lines without such a link are dropped.
extract_page_names () {
    perl -n -e 's,^.*a href="([^#/\":][^/\":]*(?:/[^\"]*)?)".*,$1, && print;'
}

# extract_subfile_names PAGE
#
# Filter: reads the HTML of PAGE's "?action=Files" listing on standard
# input and prints the name of each attached file, one per line.  A
# link is accepted only if its target equals its link text with spaces
# replaced by %20; anything else produces a warning on standard error.
#
# PAGE is handed to Perl through the environment and quoted with
# \Q...\E, so characters in the page name can neither act as regular
# expression operators nor break the Perl program.
extract_subfile_names () {
    WIKI_BACKUP_PAGE="$1" perl -n -0777 -e 'while (m,<a href="/\Q$ENV{WIKI_BACKUP_PAGE}\E\?action=Files&amp;do=view&target=([^\"]+)">([^<]+)</a>,g) { ($x,$y) = ($1,$2); $z = $y; $z =~ s/ /%20/g; if ($x eq $z) { print "$y\n"; } else { print STDERR "can not happen (I hope)\n"; }}'
}

# backup_page PAGE
#
# Save the raw text of PAGE as backup.d/PAGE.f and every file attached
# to it as backup.d/PAGE.s/FILE.  Uses the scratch file "tempfile" in
# the current directory.
#
# Returns 1 (after printing a message on standard error) if the wiki
# answers the raw-text request with an HTML document, which is taken
# to mean a permissions problem; returns 0 otherwise.  Failed subfile
# downloads do not affect the result.
backup_page () {
    local page="$1"
    # The page text goes into a file with a ".f" extension because
    # there are cases where both XYZ and XYZ/ABC are valid wiki pages,
    # so XYZ must be usable as a directory as well.
    local stem="backup.d/$page"
    local target="$stem.f"
    local dir subfile
    dir="$(dirname "$target")"

    trace beginning of loop body for page list
    echo backing up page "$page"
    debug "f: [$page], s: [$stem], t: [$target], d: [$dir]"
    mkdir -p "$dir"

    trace just before fetching raw wiki text
    fetch_url "$page?action=raw" > tempfile
    if head -n 1 tempfile | grep --quiet 'DOCTYPE HTML PUBLIC'; then
        echo probable permissions error, backup failed, aborting! 1>&2
        return 1
    fi
    cp tempfile "$target"

    trace just before fetching subfile web page
    fetch_url "$page?action=Files" |
    extract_subfile_names "$page" |
    while IFS= read -r subfile; do
        echo backing up page "$page" subfile "$subfile"
        debug "g: [$subfile]"
        mkdir -p "$stem.s"
        # The parameters are separated by a plain "&"; "&amp;" is only
        # how that character is written inside HTML.
        fetch_url "$page?sendfile=true&file=$subfile" > "$stem.s/$subfile"
    done
    return 0
}

# Main program: fetch the list of all pages and back up each of them.
#
# Debugging tip: temporarily insert "cat; exit" (or a "grep PageName |"
# stage) after a pipeline stage to inspect or narrow what it produces.
trace just before getting page list
fetch_url "All_Pages" |
extract_page_names |
{
    # This group is the last stage of a pipeline and therefore runs in
    # a subshell: "exit" here only ends the subshell, so the result is
    # handed to the main shell through the pipeline's exit status.
    while IFS= read -r page; do
        backup_page "$page" || exit 1
    done
    exit 0
}
backup_status=$?
trace end of program
# Make an aborted backup visible to the caller instead of finishing
# with a success status.
[ "$backup_status" -eq 0 ] || exit "$backup_status"
This is a Wiki Spot wiki. Wiki Spot is a non-profit organization that helps communities collaborate via wikis.