#!/bin/bash
# file: sitemapper.sh
# Pete Nesbitt - April 2011  pete@linux1.ca
#
# Creates a simple site map; should work fine as a cgi script on many (most?)
# hosting services. It has been written to the lowest common denominator
# (like GET instead of POST) in order to work in as many hosting
# environments as possible.
#
# NOTE: this script is expected to run as a cgi file and accessed via a
# browser. It will work at cmd line but prints out a string of html.

#########################################
# Variables
#########################################

#########################################
# Things you will probably want to change

DOMAIN="example.com"

# this is for the default top declarations. Change if you use shtml or php or...
MAIN_INDEX="index.html"

HOME_URL="linux1.ca"

# You can change this one if the auto detect is not working.
# If left empty, the script will attempt to determine it, but this is not
# 100% reliable, it just takes a few guesses.
#FQ_DOC_ROOT="/var/www/vhosts/example.com/subdomains/mysite/httpdocs"
FQ_DOC_ROOT=""

#########################################
# Things you might want to change
#
# The robots.txt file only looks at 'paths beginning with', so here we can
# exclude some deeper key words.
# NOTE: directories will be encased in slashes when searching; this will
# help eliminate mismatches (except like-named dirs).
# An empty string will include all your files or directories (except robots
# exclusions) into the sitemap file.
# These will be "if match, exclude from file". White space separated.
EXCLUDE_FILES=".htaccess /sitemap.xml*"
EXCLUDE_DIRS=""
MAP_FILENAME="sitemap.xml"
# empty means in the root of your website (http://example.com/sitemap.xml)
WEB_LOCATION=""
# collapse any double slashes produced by empty WEB_LOCATION
OUTPUT_FILE="$(printf '%s' "${FQ_DOC_ROOT}/${WEB_LOCATION}/${MAP_FILENAME}" | sed -e 's|//|/|g')"

##################################################################################
#                    usually no need to change anything beyond here              #
##################################################################################

# can temporarily over-ride this via the web interface, so you can test using
# a fake-robots.txt and disallow any large dirs.
ROBOTS="robots.txt"

BU_DATE=$(date +%Y-%j-%H%M%S)

# used in html section:
TITLE="Pete's SiteMapper"
# NOTE(review): the original value wrapped ${TITLE} in heading markup
# ("class supports h1/2/3") that was lost when this file was extracted.
LOGO_TITLE="
${TITLE}
"
VERSION="1.0"
VERSION_DATE="April 2011"
START_NOW="no"   # is reset to "continue" when user clicks "create sitemap" button
WARN=0           # change font colour if no doc root found

# Minimal GET decoding: %2F -> / and + -> space.
INPUT_DATA="$(printf '%s' "${QUERY_STRING}" | sed -e 's|%2F|/|g' -e 's|+| |g')"

########### END OF VARIABLES ########################################################
# do not modify anything beyond here unless you understand what you are up to.

##############################
# Functions
##############################

# Try to find doc root. Assume we are in cgi-bin, parallel to doc root.
# Globals: FQ_DOC_ROOT (read/write), OUTPUT_FILE (write), WARN (write),
#          WEB_LOCATION/MAP_FILENAME (read)
find_doc_root()
{
  # only run if FQ_DOC_ROOT is empty, else honour it
  if [ -z "${FQ_DOC_ROOT}" ]; then
    FOUND=0
    # first match wins, in this order (same precedence as the original checks)
    for DR_DIR in httpdocs html htdocs; do
      if [ -e "../${DR_DIR}" ]; then
        FOUND=1
        break
      fi
    done

    if [ ${FOUND} -eq 1 ]; then
      FQ_DOC_ROOT="$(dirname "$(pwd)")/${DR_DIR}"
      # plain assignment — the original used 'eval' here, which is both
      # unnecessary and dangerous on web-supplied values
      OUTPUT_FILE="$(printf '%s' "${FQ_DOC_ROOT}/${WEB_LOCATION}/${MAP_FILENAME}" | sed -e 's|//|/|g')"
    else
      FQ_DOC_ROOT="REQUIRED: DocumentRoot (could not detect your Doc Root)"
      WARN=1
    fi
  fi
}

# Print the settings form showing the current variable values.
# NOTE(review): the original HTML form markup (pre-populated input fields)
# was stripped when this file was extracted; only the label text survives.
# The WARN branch presumably differed only in styling (e.g. red text).
show_vars()
{
  cat << EOF
Your Domain:  ${DOMAIN}
(example.com OR www.example.com will work)

Name of home page Index file:  ${MAIN_INDEX}

EOF
  if [ ${WARN} -eq 1 ]; then
    cat << EOF1
Full Path to DocumentRoot:  ${FQ_DOC_ROOT}
EOF1
  else
    cat << EOF1
Full Path to DocumentRoot:  ${FQ_DOC_ROOT}
EOF1
  fi
  cat << EOF2

File Names to exclude:  ${EXCLUDE_FILES}
(whitespace separated list. Use a  *  as a wildcard)

Directory Names to exclude:  ${EXCLUDE_DIRS}
(whitespace separated list. Use a  *  as a wildcard)

Robots file :  ${ROBOTS}
(Only Change for Testing)

New Map File will be named:  ${MAP_FILENAME}

New Files Web Relative Location:  /${WEB_LOCATION}
(Do Not include the leading slash)

You can store the above information by modifying the variables at the top of the script.
EOF2
}

# Collect and parse any GET data input.
# The query string arrives from this page being reloaded by the user via a form:
#   domain_name=XX&main_index=XX&doc_root=XX&...&start_process=XX
# All form fields are prepopulated so we can set them all.
# Globals read: INPUT_DATA. Globals written: DOMAIN, MAIN_INDEX, FQ_DOC_ROOT,
# EXCLUDE_FILES, EXCLUDE_DIRS, ROBOTS, MAP_FILENAME, WEB_LOCATION, START_NOW,
# OUTPUT_FILE.
update_vars()
{
  # don't try to update vars unless there is data in the query string
  if [ -n "${INPUT_DATA}" ]; then
    DOMAIN="$(printf '%s' "${INPUT_DATA}"        | cut -d'&' -f1 | cut -d= -f2 | tr -d '[:blank:]')"
    MAIN_INDEX="$(printf '%s' "${INPUT_DATA}"    | cut -d'&' -f2 | cut -d= -f2 | tr -d '[:blank:]')"
    FQ_DOC_ROOT="$(printf '%s' "${INPUT_DATA}"   | cut -d'&' -f3 | cut -d= -f2 | tr -d '[:blank:]')"
    # the two exclude lists are whitespace-separated: squeeze, don't delete
    EXCLUDE_FILES="$(printf '%s' "${INPUT_DATA}" | cut -d'&' -f4 | cut -d= -f2 | tr -s '[:blank:]' ' ')"
    EXCLUDE_DIRS="$(printf '%s' "${INPUT_DATA}"  | cut -d'&' -f5 | cut -d= -f2 | tr -s '[:blank:]' ' ')"
    ROBOTS="$(printf '%s' "${INPUT_DATA}"        | cut -d'&' -f6 | cut -d= -f2 | tr -d '[:blank:]')"
    MAP_FILENAME="$(printf '%s' "${INPUT_DATA}"  | cut -d'&' -f7 | cut -d= -f2 | tr -d '[:blank:]')"
    WEB_LOCATION="$(printf '%s' "${INPUT_DATA}"  | cut -d'&' -f8 | cut -d= -f2 | tr -d '[:blank:]')"
    START_NOW="$(printf '%s' "${INPUT_DATA}"     | cut -d'&' -f9 | cut -d= -f2 | tr -d '[:blank:]')"
    # SECURITY: the original ran 'eval' over these web-supplied values, which
    # allows shell injection via the query string. A plain assignment is all
    # that is needed.
    OUTPUT_FILE="$(printf '%s' "${FQ_DOC_ROOT}/${WEB_LOCATION}/${MAP_FILENAME}" | sed -e 's|//|/|g')"
  fi
}

###############
# html code

# NOTE(review): the original inline stylesheet was lost in extraction;
# the heredoc is kept so callers still work.
css_style()
{
  cat << EOCSS
EOCSS
}

# CGI header + opening HTML. The Content-Type line must come first and be
# followed by a blank line (CGI requirement).
html_head()
{
  cat << EOH1
Content-Type: text/html; charset=UTF-8

${TITLE}
EOH1
  css_style
  cat << EOH2
EOH2
}

# Page banner: logo/title plus version line.
html_content_head()
{
  cat << EOCH
${LOGO_TITLE}
version ${VERSION} (${VERSION_DATE})

EOCH
}

# Closing HTML (markup lost in extraction; heredoc retained).
html_foot()
{
  cat << EOFT
EOFT
}

######################
# the main part: create a sitemap.xml file
build_map()
{
  # save old file (and send info to screen)
  if [ -e "${OUTPUT_FILE}" ]; then
    echo -n "Saving existing ${WEB_LOCATION}/${MAP_FILENAME} as ${MAP_FILENAME}_${BU_DATE}"
    mv -f "${OUTPUT_FILE}" "${OUTPUT_FILE}_${BU_DATE}" > /dev/null 2>&1
    echo " ...done"
    echo ""
  fi

  # First the consistent part. Using '>' (not '>>') so a failed backup 'mv'
  # above cannot leave us appending to a stale map.
  # NOTE(review): the original XML wrapper tags (<urlset>/<url>/<loc>/...)
  # were lost in extraction; only the values survive below.
  cat > "${OUTPUT_FILE}" << EOH
http://${DOMAIN}/
1.0
weekly
http://${DOMAIN}/${MAIN_INDEX}
1.0
weekly
EOH

  # read robots for exclusions, set up for easy grep -E
  DISALLOWED=""
  for disallow in $(grep -i Disallow "$(find "${FQ_DOC_ROOT}/" -type f -name "${ROBOTS}")" | tr -d '[:blank:]' | cut -d':' -f2)
  do
    DISALLOWED="${DISALLOWED} ${disallow}"
  done

  # Build the three alternation patterns ONCE — they do not depend on the
  # file being examined (the original rebuilt them on every loop iteration).
  # Disallows are always the beginning of the relevant path and don't use
  # wildcards (from what I can find).
  R_PATTERN=""
  for exclude in ${DISALLOWED}; do
    R_PATTERN="${R_PATTERN}|${exclude}"
  done
  R_PATTERN="${R_PATTERN#|}"

  # directories are encased in slashes to reduce mismatches
  D_PATTERN=""
  for exclude in ${EXCLUDE_DIRS}; do
    D_PATTERN="${D_PATTERN}|/${exclude}/"
  done
  D_PATTERN="${D_PATTERN#|}"

  F_PATTERN=""
  for exclude in ${EXCLUDE_FILES}; do
    F_PATTERN="${F_PATTERN}|${exclude}"
  done
  F_PATTERN="${F_PATTERN#|}"

  # find all regular files; an empty pattern means "no exclusions of that kind"
  for file_name in $(find "${FQ_DOC_ROOT}/" -type f)
  do
    if [ -n "${R_PATTERN}" ] && printf '%s\n' "${file_name}" | grep -E "${R_PATTERN}" > /dev/null 2>&1; then
      continue   # excluded by robots.txt
    fi
    if [ -n "${D_PATTERN}" ] && printf '%s\n' "${file_name}" | grep -E "${D_PATTERN}" > /dev/null 2>&1; then
      continue   # excluded directory
    fi
    if [ -n "${F_PATTERN}" ] && printf '%s\n' "${file_name}" | grep -E "${F_PATTERN}" > /dev/null 2>&1; then
      continue   # excluded file name
    fi

    # map the filesystem path back to a URL
    LOCATION="$(printf '%s' "${file_name}" | sed "s|^${FQ_DOC_ROOT}|http://${DOMAIN}|")"
    cat >> "${OUTPUT_FILE}" << EOLOC
${LOCATION}
EOLOC
    # send a status dot to screen
    echo -n "."
  done
  echo "" >> "${OUTPUT_FILE}"
}

##############################
# start of work
##############################

html_head
html_content_head

# update vars now; if no data, no changes, else it may set START_NOW to continue
update_vars
find_doc_root

# what should we be doing
case "${START_NOW}" in
  no)
    # default: don't do anything, just print the form to check vars
    # NOTE(review): the original wrapped show_vars in HTML form markup
    # (lost in extraction).
    cat << EOSV
EOSV
    show_vars
    cat << EOSV2
EOSV2
    ;;
  continue)
    update_vars
    cat << EOB
Build new XML-sitemap

Progress:
EOB
    build_map
    # just to clean up the url display
    URL_FILE="$(printf '%s' "${DOMAIN}/${WEB_LOCATION}/${MAP_FILENAME}" | sed -e 's|//|/|g')"
    cat << EOB2
DONE.

The new sitemap file is:  ${OUTPUT_FILE}

The URL is:  http://${URL_FILE}

If you don't use Google Webtools, you can submit your sitemap pasting this link into your browser:

   http://www.google.com/webmasters/sitemaps/ping?sitemap=http://${URL_FILE}

NOTE: I had a 'back' button but removed it because if run multiple times it would rerun the script.
It is probably best not to use the browsers back button.

Home

You can update this link by modifying the HOME_URL variable at the top of the script.
EOB2
    ;;
  *)
    show_vars
    ;;
esac

html_foot
# eof