Приглашаем посетить
Экономика (economics.niv.ru)

WebSearch

#!/usr/bin/perl

############################################
##                                        ##
##               WebSearch                ##
##           by Darryl Burgdorf           ##
##       (e-mail burgdorf@awsd.com)       ##
##                                        ##
##         last modified: 5/25/97         ##
##           copyright (c) 1997           ##
##                                        ##
##    latest version is available from    ##
##        http://awsd.com/scripts/        ##
##                                        ##
############################################

# COPYRIGHT NOTICE:
#
# Copyright 1997 Darryl C. Burgdorf.  All Rights Reserved.
#
# This program may be used and modified free of charge by anyone, so
# long as this copyright notice and the header above remain intact.  By
# using this program you agree to indemnify Darryl C. Burgdorf from any
# liability.
#
# Selling the code for this program without prior written consent is
# expressly forbidden.  Obtain permission before redistributing this
# program over the Internet or in any other medium.  In all cases
# copyright and header must remain intact.

# VERSION HISTORY:
#
# 1.05  05/25/97  Fixed bug in "Next/Previous Set" forms
# 1.04  05/21/97  Added META tags to searched material
#                 Revised search directory definition methodology
#                 Added ability to search within multiple URLs
#                 Set script to display only X matches per page
#                 Added "as a phrase" option to boolean choices
#                 Changed "value" count to "relevance" computation
#                 Eliminated incomplete "choose directory" option
# 1.03  04/03/97  Added ALT text to searched material
#                 Added display of total number of files searched
#                 Added $avoid to designate files not to be searched
#                 Fixed bug in the way titles are obtained
#                 Fixed bug introduced by "minor code shuffling"
# 1.02  02/17/97  Fixed bug in Get_Date subroutine
# 1.01  02/07/97  Minor code shuffling
# 1.00  02/03/97  Initial "public" release

####################
# GENERAL COMMENTS #
####################

# WebSearch allows users to search for key words in documents located
# on your Web site.  It searches the actual documents, rather than a
# master index file.  On the "up" side, that means the results it
# returns are always up-to-the-minute.  On the "down" side, of course,
# it means that it takes a bit longer than some other scripts to return
# those results.  It's a tradeoff, but if you're working with relatively
# small file sets, the difference probably won't be too pronounced.
#
# The script scores the match URLs based upon the frequency with which
# the requested key terms appear in the documents, and also lists the
# date on which each file was last modified.  It searches the basic text
# of the documents, as well as ALT text and any information contained in
# META "keywords" and "description" tags.  It does *not* search HTML
# tags or comments, so, for example, a search for "HTML" won't key on
# every "A HREF" tag.

#########
# SETUP #
#########

# The script, of course, must be called from a search form on a Web
# page.  The form should look something like the form below.  The exact
# structure of the form is not too important, of course, so long as the
# correct fields and options exist.  If you leave out the "boolean" and
# "case" fields, the script will default to a case-insensitive boolean
# "OR" ("any terms") search.

# <FORM METHOD=POST ACTION="http://www.foo.com/cgi-bin/websearch.pl">
#
# <P><CENTER>Terms for which to search (separated by spaces):
# <BR><INPUT TYPE=TEXT NAME="terms" SIZE=60>
#
# <P>Find: <SELECT NAME="boolean">
# <OPTION>any terms<OPTION>all terms<OPTION>as a phrase</SELECT> 
# Case: <SELECT NAME="case">
# <OPTION>insensitive<OPTION>sensitive</SELECT>
#
# <P><INPUT TYPE=SUBMIT VALUE="Search">
#
# </CENTER></FORM></P>

# A variety of variables need to be defined.  First, you should define
# @files as shown below with a list of the full (absolute) paths to the
# directories you wish the script to search.  (All text files in those
# directories will be searched, unless excluded below.)  The absolute
# path of any directory can be found by issuing the UNIX "pwd" command
# while in that directory.  Each directory name should end with a "/" if
# you want only the files in the directory to be searched, and with a
# "/*" if you want the files in the directory and in any immediate
# subdirectories to be searched.

@files = (
	'/usr/www/foo/scripts/dir1/',
	'/usr/www/foo/scripts/dir2/*',
);

# $avoid names the files that must be left out of the search.  It is
# applied as a case-insensitive pattern to the full path of every
# candidate file, so any path that contains one of the alternatives is
# skipped.  Listing "txt", for example, excludes every ".txt" file (and
# anything else with "txt" somewhere in its path).

$avoid = '(backup|cgi|pl|txt)';

# $cgiurl is the URL of this script; the forms on the results page
# submit to it.

$cgiurl = 'http://www.foo.com/cgi-bin/websearch.pl';

# $basepath is the absolute path of a "base" directory that contains all
# of the searched directories, and $baseurl is the URL that corresponds
# to it.  The results page replaces $basepath with $baseurl to turn each
# matching file's path into a link.

$basepath = '/usr/www/foo/scripts/';
$baseurl = 'http://www.foo.com/scripts/';

# Some files may need a different URL prefix -- because they live under
# another virtual domain, say, or must be reached "through" a shopping
# cart or other CGI program.  To handle them, uncomment the lines below
# and fill %otherurls with path/URL pairs.  These pairs are applied
# *before* the $basepath/$baseurl conversion, so each path listed here
# should be either a subpath of the "base" path or entirely separate
# from it.

# %otherurls = (
#   '/usr/www/foo/scripts/dir2/sub1/',
#   'http://www.foo.com/cgi-bin/some.cgi?access=',
#   '/usr/www/foo/scripts/dir2/sub2/',
#   'http://www.foo.com/scripts/dir2/sub2/another.cgi?read='
#   );

# $HitsPerPage is the number of matches listed on each results page.

$HitsPerPage = 10;

#####################
# THE ACTUAL SCRIPT #
#####################

# You shouldn't have to change anything in this section!

$version = "1.05";

# Day and month abbreviations.  @month is indexed with the zero-based
# month number that localtime() returns (see Get_Date).
@day = qw(Sun Mon Tue Wed Thu Fri Sat);
@month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# Read the URL-encoded form data from the POST body.  A missing
# CONTENT_LENGTH is treated as an empty body rather than being handed
# to read() as an undefined length.
$buffer = '';
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'} || 0);
%FORM = Parse_Form($buffer);

# Fall back to a case-insensitive "any terms" search when the form did
# not supply these fields.
unless ($FORM{'boolean'}) { $FORM{'boolean'} = "any terms"; }
unless ($FORM{'case'}) { $FORM{'case'} = "insensitive"; }

# Trim the search string, then either keep it whole (phrase search) or
# break it into individual words.
$FORM{'terms'} = '' unless (defined $FORM{'terms'});
$FORM{'terms'} =~ s/^\s*//;
$FORM{'terms'} =~ s/\s*$//;
if ($FORM{'boolean'} eq "as a phrase") {
	push (@terms,$FORM{'terms'});
}
else {
	@terms = split(/\s+/,$FORM{'terms'});
}

# $terms is the number of search terms; $bestmatch starts at five hits
# per term and is raised by the search loop whenever a file scores
# higher, so relevance percentages are computed against it.
$terms = @terms;
$termscount = $terms;
$bestmatch = 5*$terms;

$matchcount=0;
$filecount=0;

# Parse_Form: decode an application/x-www-form-urlencoded string.
#
# Takes the raw query string and returns a hash of field name => value.
# "+" is turned into a space and %XX escapes are decoded in both names
# and values.  When a field name occurs more than once, its values are
# joined with ", ".  Each pair is split on the FIRST "=" only, so a
# value that itself contains "=" is preserved intact.
sub Parse_Form {
	my ($query) = @_;
	my %form;
	$query = '' unless (defined $query);
	foreach my $pair (split(/&/, $query)) {
		my ($name, $value) = split(/=/, $pair, 2);
		next unless (defined $name);
		$value = '' unless (defined $value);
		foreach ($name, $value) {
			tr/+/ /;
			s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
		}
		if ($form{$name}) {
			$form{$name} = "$form{$name}, $value";
		}
		else {
			$form{$name} = $value;
		}
	}
	return %form;
}

# Build @FILES, the list of text files to search, from the directories
# named in @files.  An entry ending in "*" means "this directory plus
# its immediate subdirectories"; any other entry means the directory
# alone.  Only files that pass Perl's -T (text file) test are kept.
foreach my $entry (@files) {
	# Work on a copy so the configured @files list is left untouched.
	my $dir = $entry;
	my $subdirs = 0;
	if ($dir =~ s/\*$//) {
		$subdirs = 1;
	}
	foreach my $sub (List_Dir($dir)) {
		if ((-d "$dir$sub") && ($subdirs > 0)) {
			foreach my $subsub (List_Dir("$dir$sub")) {
				if (-T "$dir$sub/$subsub") {
					push (@FILES,"$dir$sub/$subsub");
				}
			}
		}
		elsif (-T "$dir$sub") {
			push (@FILES,"$dir$sub");
		}
	}
}

# List_Dir: return the sorted names of the entries in a directory.
#
# Names beginning with "." are left out, matching what a plain "ls"
# shows.  The directory is read with opendir/readdir instead of running
# "ls" through the shell, so names containing spaces, quotes or other
# shell metacharacters are returned intact and are never interpreted by
# a shell.  A directory that cannot be opened yields an empty list.
sub List_Dir {
	my ($dir) = @_;
	opendir(my $dh, $dir) or return ();
	my @names = grep { !/^\./ } readdir($dh);
	closedir($dh);
	@names = sort @names;
	return @names;
}

# Score every collected file against the search terms.
#
# For each file this fills in %title (page title, or the path when no
# title is found), %update (last-modified date string) and, for files
# that match, %truval (relevance score).  $filecount, $matchcount and
# $bestmatch are updated along the way.  Terms shorter than three
# characters are ignored.

# Any value of "case" other than "insensitive" is treated as a
# case-sensitive search.
my $ignore_case = ($FORM{'case'} eq 'insensitive');

foreach $FILE (@FILES) {
	next if (($avoid) && ($FILE =~ m#$avoid#oi));
	# Three-argument open: the file name is never interpreted as a mode
	# or a pipe.  Files that cannot be read are skipped and not counted.
	open(my $fh, '<', $FILE) or next;
	my @lines = <$fh>;
	close($fh);
	$filecount++;
	my @info = stat($FILE);
	# Get_Date reads the timestamp from the global $mtime.
	$mtime = $info[9];
	my $kbytesize = int((($info[7] || 0)/1024)+.5);
	$update{$FILE} = Get_Date();
	my $string = join(' ',@lines);
	$string =~ s/\n/ /g;
	$val{$FILE} = 0;
	# The title is repeated once per kilobyte of file size and prepended
	# to the searchable text, so title hits weigh more in larger files.
	my $titlestring = "";
	if ($string =~ /<TITLE>([^>]+)<\/TITLE>/i) {
		$title{$FILE} = "$1";
		$titlestring = "$title{$FILE}"x$kbytesize;
	}
	elsif ($string =~ /SUBJECT>(.+)POSTER>/i) {
		$title{$FILE} = "$1";
		$titlestring = "$title{$FILE}"x$kbytesize;
	}
	else {
		$title{$FILE} = "$FILE";
	}
	$title{$FILE} =~ s/^\s*//;
	$title{$FILE} =~ s/\s*$//;
	# Keep ALT text and META description/keywords content, then drop
	# every remaining tag so that markup itself is never searched.
	$string =~ s/<[^>]*\s+ALT\s*=\s*"(([^>"])*)"[^>]*>/$1/ig;
	$string =~ s/<[^>]*META[^>]+NAME\s*=[ "]*(description|keywords)[ "]+CONTENT\s*=\s*"(([^>"])*)"[^>]*>/$2/ig;
	$string =~ s/<([^>])*>//g;
	$string = $titlestring." ".$string;
	if ($FORM{'boolean'} eq 'all terms') {
		# Every term must occur at least once; one miss zeroes the score.
		foreach my $term (@terms) {
			next if (length($term) < 3);
			my $hits = Count_And_Strip(\$string, $term, $ignore_case);
			if ($hits < 1) {
				$val{$FILE} = 0;
				last;
			}
			$val{$FILE} = $val{$FILE}+$hits;
		}
	}
	else {
		# "any terms" / "as a phrase": add up the hits and count how many
		# distinct terms were found in this file.
		$termscount = 0;
		foreach my $term (@terms) {
			next if (length($term) < 3);
			my $hits = Count_And_Strip(\$string, $term, $ignore_case);
			$val{$FILE} = $val{$FILE}+$hits;
			if ($hits > 0) { $termscount++; }
		}
	}
	if ($val{$FILE} > 0) {
		# Scale the raw hit count by the fraction of terms that matched.
		$truval{$FILE} = ($val{$FILE}*($termscount/$terms));
		if ($truval{$FILE} > $bestmatch) {
			$bestmatch = $truval{$FILE};
		}
		$matchcount++;
	}
}

# Count_And_Strip: remove every occurrence of a search term from a text
# and report how many were removed.
#
#   $textref      reference to the text; it is modified in place
#   $term         the term to look for, taken literally
#   $ignore_case  true for a case-insensitive match
#
# The term is quoted with \Q...\E, so characters such as "+", "(" or
# "*" typed by the user are matched as themselves instead of being
# compiled as regular-expression syntax.  Returns the number of
# occurrences removed (0 when there were none).
sub Count_And_Strip {
	my ($textref, $term, $ignore_case) = @_;
	my $hits;
	if ($ignore_case) {
		$hits = ($$textref =~ s/\Q$term\E//ig);
	}
	else {
		$hits = ($$textref =~ s/\Q$term\E//g);
	}
	return $hits ? $hits : 0;
}

##########
# OUTPUT #
##########

# The script's output can, of course, be modified to suit the specific
# "look" of the site being searched.  Don't try to make major changes,
# though, unless you're reasonably sure that you know what you're doing,
# as there are a *lot* of conditionals and variables in the output.

# Everything that came from the submitted form is HTML-escaped before it
# is echoed back, so search terms containing <, >, & or " cannot inject
# markup or break out of a VALUE="..." attribute.
my $safe_boolean = Html_Escape($FORM{'boolean'});
my $safe_case = Html_Escape($FORM{'case'});

# The searched terms (those of three or more characters), each followed
# by a space, ready for display and for the forms' VALUE attributes.
my $safe_terms = join('', map { Html_Escape($_)." " } grep { length($_) >= 3 } @terms);

print "Content-type: text/html\n\n";
print "<HTML><HEAD><TITLE>Search Results</TITLE></HEAD>\n";
print "<BODY BGCOLOR=\"#ffffff\" TEXT=\"#000000\">";
print "<H1 ALIGN=CENTER>Search Results</H1>\n";
print "<P ALIGN=CENTER>Keywords ($safe_boolean, ";
print "case $safe_case): <STRONG>";
print $safe_terms;
print "</STRONG></P>\n";
print "<P ALIGN=CENTER><SMALL>";
print "(<STRONG>$filecount</STRONG> files searched; ";
print "<STRONG>$matchcount</STRONG> match";
if ($matchcount == 1) {
	print " found)";
}
else {
	print "es found)";
}
print "</SMALL>\n";

# "first" and "last" select the window of matches to show.  They must be
# positive integers; anything else falls back to the first page.
unless (defined($FORM{'first'}) && ($FORM{'first'} =~ /^\d+\z/) && ($FORM{'first'} > 0)) {
	$FORM{'first'} = 1;
}
unless (defined($FORM{'last'}) && ($FORM{'last'} =~ /^\d+\z/) && ($FORM{'last'} > 0)) {
	$FORM{'last'} = $HitsPerPage;
}

if ($matchcount == 0) {
	print "<P ALIGN=CENTER>No documents match your search criteria!";
	print "<BR>You might want to revise them and try again.</P>\n";
}
else {
	print "<P ALIGN=CENTER><STRONG>Matches $FORM{'first'} ";
	if ($matchcount < $FORM{'last'}) {
		print "- $matchcount</STRONG></P>\n";
	}
	else {
		print "- $FORM{'last'}</STRONG></P>\n";
	}
	print "<HR WIDTH=50%>\n";
	my $Count = 0;
	print "<P><UL>\n";
	foreach my $key (sort ByValue keys %truval) {
		$Count++;
		next if ($Count < $FORM{'first'});
		last if ($Count > $FORM{'last'});
		# Turn the file's path into a URL.  The configured paths are
		# quoted with \Q...\E so that "." and other punctuation in them
		# match literally rather than as regex metacharacters.
		my $fileurl = $key;
		if (%otherurls) {
			foreach my $path (keys %otherurls) {
				$fileurl =~ s/\Q$path\E/$otherurls{$path}/i;
			}
		}
		$fileurl =~ s/\Q$basepath\E/$baseurl/i;
		print "<LI><STRONG><A HREF=\"$fileurl\">";
		# The title was taken from the document's own HTML, so it is
		# printed as-is (escaping it again would double-encode entities).
		print "$title{$key}</A></STRONG> ";
		my $relevance = int(((100/$bestmatch)*$truval{$key})+.5);
		print "(Relevance: $relevance%)\n";
		print "<BR>Last updated $update{$key}<P>\n";
	}
	print "</UL></P>\n";
	print "<P><CENTER>\n";
	if ($FORM{'first'} > 1) {
		Print_Page_Form($FORM{'first'}-$HitsPerPage,
			$FORM{'last'}-$HitsPerPage, "Previous", $safe_terms);
	}
	if ($FORM{'last'} < $matchcount) {
		Print_Page_Form($FORM{'first'}+$HitsPerPage,
			$FORM{'last'}+$HitsPerPage, "Next", $safe_terms);
	}
	print "</CENTER></P>\n";
}

print "<HR><H2 ALIGN=CENTER>New Search</H2>\n";
print "<FORM METHOD=POST ACTION=\"$cgiurl\">\n";
print "<P><CENTER>Terms for which to search (separated by spaces):\n";
print "<BR><INPUT TYPE=TEXT NAME=\"terms\" SIZE=60 VALUE=\"";
print $safe_terms;
print "\">\n<P>Find: <SELECT NAME=\"boolean\"> ";
if ($FORM{'boolean'} eq 'any terms') {
	print "<OPTION SELECTED>any terms<OPTION>all terms";
	print "<OPTION>as a phrase</SELECT> ";
}
elsif ($FORM{'boolean'} eq 'all terms') {
	print "<OPTION>any terms<OPTION SELECTED>all terms";
	print "<OPTION>as a phrase</SELECT> ";
}
else {
	print "<OPTION>any terms<OPTION>all terms";
	print "<OPTION SELECTED>as a phrase</SELECT> ";
}
print "Case: <SELECT NAME=\"case\"> ";
if ($FORM{'case'} eq 'insensitive') {
	print "<OPTION SELECTED>insensitive<OPTION>sensitive</SELECT>\n";
}
else {
	print "<OPTION>insensitive<OPTION SELECTED>sensitive</SELECT>\n";
}
print "<P><INPUT TYPE=SUBMIT VALUE=\"Search\">";
print "</CENTER></FORM></P><HR>\n";

print "<P ALIGN=CENTER><SMALL>";
print "<A HREF=\"http://awsd.com/scripts/websearch/\">";
print "WebSearch $version</A>";
print "</SMALL></P></BODY></HTML>\n";

exit;

# Html_Escape: return a copy of the given text with &, <, > and "
# replaced by the corresponding HTML entities.  An undefined argument
# yields the empty string.
sub Html_Escape {
	my ($text) = @_;
	$text = '' unless (defined $text);
	$text =~ s/&/&amp;/g;
	$text =~ s/</&lt;/g;
	$text =~ s/>/&gt;/g;
	$text =~ s/"/&quot;/g;
	return $text;
}

# Print_Page_Form: print one paging form ("Previous"/"Next" button).
#
#   $first, $last  the match window the button should request
#   $label         "Previous" or "Next"; shown on the submit button
#   $termlist      the already HTML-escaped search terms
#
# The form re-submits the current search (terms, boolean and case
# settings from %FORM) to $cgiurl with the new window.
sub Print_Page_Form {
	my ($first, $last, $label, $termlist) = @_;
	print "<FORM METHOD=POST ACTION=\"$cgiurl\">\n";
	print "<INPUT TYPE=HIDDEN NAME=\"terms\" VALUE=\"";
	print $termlist;
	print "\">\n";
	print "<INPUT TYPE=HIDDEN NAME=\"boolean\" ";
	print "VALUE=\"",Html_Escape($FORM{'boolean'}),"\">\n";
	print "<INPUT TYPE=HIDDEN NAME=\"case\" ";
	print "VALUE=\"",Html_Escape($FORM{'case'}),"\">\n";
	print "<INPUT TYPE=HIDDEN NAME=\"first\" ";
	print "VALUE=\"",$first,"\">\n";
	print "<INPUT TYPE=HIDDEN NAME=\"last\" ";
	print "VALUE=\"",$last,"\">\n";
	print "<INPUT TYPE=SUBMIT ";
	print "VALUE=\"$label $HitsPerPage Matches\">\n";
	print "</FORM>\n";
}

###############
# SUBROUTINES #
###############

# You shouldn't need to change these, either!

# Get_Date: format the timestamp in the global $mtime as a date string
# of the form "25 May 1997" (day of month, month abbreviation from
# @month, four-digit year) in local time.  If $mtime is not set, the
# current time is stored in it and used instead.
sub Get_Date {
	$mtime = time unless ($mtime);
	my ($mday,$mon,$yr) = (localtime($mtime))[3,4,5];
	# localtime() returns the year as an offset from 1900 (100 for the
	# year 2000), so it has to be added to 1900 rather than appended to
	# the string "19".
	my $date = "$mday $month[$mon] " . (1900 + $yr);
	return $date;
}

# ByValue: sort comparator that orders the keys of %truval by
# descending relevance score.  Keys with equal scores are ordered by
# the key string itself; without that tie-breaker their order would
# follow hash iteration order, which is not guaranteed to be the same
# from one request to the next, and paged results could repeat or skip
# entries.
sub ByValue {
	return ($truval{$b} <=> $truval{$a}) || ($a cmp $b);
}