#!/bin/bash
# file: sitemapper.sh
# Pete Nesbitt - April 2011  pete@linux1.ca
#
# Creates a simple site map; should work fine as a CGI script on many (most?)
# hosting services. It has been written to the lowest common denominator (like
# GET instead of POST) in order to work in as many hosting environments as
# possible.
#
# NOTE: this script is expected to run as a cgi file and accessed via a
# browser. It will work at cmd line but prints out a string of html.

#########################################
# Variables
#########################################

#########################################
# Things you will probably want to change

DOMAIN="example.com"

# this is for the default top declarations. Change if you use shtml or php or...
MAIN_INDEX="index.html"

# you can change this one if the auto detect is not working.
HOME_URL="linux1.ca"

# if left empty, the script will attempt to determine it, but this is not
# 100% reliable, it just takes a few guesses.
#FQ_DOC_ROOT="/var/www/vhosts/example.com/subdomains/mysite/httpdocs"
FQ_DOC_ROOT=""

#########################################
# Things you might want to change
# the robots.txt file only looks at 'paths beginning with', so here we can
# exclude some deeper key words.
# NOTE: directories will be encased in slashes when searching, this will help
# eliminate mismatches (except like named dirs).
# an empty string will include all your files or directories (except robots
# exclusions) into the sitemap file.
# these will be "if match, exclude from file". White space separated.
EXCLUDE_FILES=".htaccess /sitemap.xml*"
EXCLUDE_DIRS=""

MAP_FILENAME="sitemap.xml"
# empty means in the root of your website (http://example.com/sitemap.xml)
WEB_LOCATION=""

# Full path to the generated map file; the sed collapses any accidental "//"
# produced when FQ_DOC_ROOT or WEB_LOCATION is empty.
OUTPUT_FILE="$(echo "${FQ_DOC_ROOT}/${WEB_LOCATION}/${MAP_FILENAME}" | sed -e 's|//|/|g')"

##################################################################################
#                 usually no need to change anything beyond here                 #
##################################################################################

# can temporarily over-ride this via the web interface, so you can test using
# a fake-robots.txt and disallow any large dirs.
ROBOTS="robots.txt"

# timestamp suffix for backup copies (year-dayofyear-HHMMSS)
BU_DATE=$(date +%Y-%j-%H%M%S)

# used in html section:
TITLE="Pete's SiteMapper"
# NOTE(review): the LOGO_TITLE assignment is truncated at this chunk boundary
# (only its opening quote is visible); it is preserved here as a comment rather
# than guessing at the missing value — restore it from the full file:
# LOGO_TITLE="