Приглашаем посетить
WebSearch
#!/usr/bin/perl
############################################
## ##
## WebSearch ##
## by Darryl Burgdorf ##
## (e-mail burgdorf@awsd.com) ##
## ##
## last modified: 5/25/97 ##
## copyright (c) 1997 ##
## ##
## latest version is available from ##
## http://awsd.com/scripts/ ##
## ##
############################################
# COPYRIGHT NOTICE:
#
# Copyright 1997 Darryl C. Burgdorf. All Rights Reserved.
#
# This program may be used and modified free of charge by anyone, so
# long as this copyright notice and the header above remain intact. By
# using this program you agree to indemnify Darryl C. Burgdorf from any
# liability.
#
# Selling the code for this program without prior written consent is
# expressly forbidden. Obtain permission before redistributing this
# program over the Internet or in any other medium. In all cases
# copyright and header must remain intact.
# VERSION HISTORY:
#
# 1.05 05/25/97 Fixed bug in "Next/Previous Set" forms
# 1.04 05/21/97 Added META tags to searched material
# Revised search directory definition methodology
# Added ability to search within multiple URLs
# Set script to display only X matches per page
# Added "as a phrase" option to boolean choices
# Changed "value" count to "relevance" computation
# Eliminated incomplete "choose directory" option
# 1.03 04/03/97 Added ALT text to searched material
# Added display of total number of files searched
# Added $avoid to designate files not to be searched
# Fixed bug in the way titles are obtained
# Fixed bug introduced by "minor code shuffling"
# 1.02 02/17/97 Fixed bug in Get_Date subroutine
# 1.01 02/07/97 Minor code shuffling
# 1.00 02/03/97 Initial "public" release
####################
# GENERAL COMMENTS #
####################
# WebSearch allows users to search for key words in documents located
# on your Web site. It searches the actual documents, rather than a
# master index file. On the "up" side, that means the results it
# returns are always up-to-the-minute. On the "down" side, of course,
# it means that it takes a bit longer than some other scripts to return
# those results. It's a tradeoff, but if you're working with relatively
# small file sets, the difference probably won't be too pronounced.
#
# The script scores the match URLs based upon the frequency with which
# the requested key terms appear in the documents, and also lists the
# date on which each file was last modified. It searches the basic text
# of the documents, as well as ALT text and any information contained in
# META "keywords" and "description" tags. It does *not* search HTML
# tags or comments, so, for example, a search for "HTML" won't key on
# every "A HREF" tag.
#########
# SETUP #
#########
# The script, of course, must be called from a search form on a Web
# page. The form should look something like the form below. The exact
# structure of the form is not too important, of course, so long as the
# correct fields and options exist. If you leave out the "boolean" and
# "case" fields, the script will default to a case-insensitive boolean
# "OR" ("any terms") search.
# <FORM METHOD=POST ACTION="http://www.foo.com/cgi-bin/websearch.pl">
#
# <P><CENTER>Terms for which to search (separated by spaces):
# <BR><INPUT TYPE=TEXT NAME="terms" SIZE=60>
#
# <P>Find: <SELECT NAME="boolean">
# <OPTION>any terms<OPTION>all terms<OPTION>as a phrase</SELECT>
# Case: <SELECT NAME="case">
# <OPTION>insensitive<OPTION>sensitive</SELECT>
#
# <P><INPUT TYPE=SUBMIT VALUE="Search">
#
# </CENTER></FORM></P>
# A variety of variables need to be defined. First, you should define
# @files as shown below with a list of the full (absolute) paths to the
# directories you wish the script to search. (All text files in those
# directories will be searched, unless excluded below.) The absolute
# path of any directory can be found by issuing the UNIX "pwd" command
# while in that directory. Each directory name should end with a "/" if
# you want only the files in the directory to be searched, and with a
# "/*" if you want the files in the directory and in any immediate
# subdirectories to be searched.
@files = ('/usr/www/foo/scripts/dir1/','/usr/www/foo/scripts/dir2/*');
# If there are particular files you *don't* want included in the
# search, define them in the $avoid variable below. You need only
# include enough of the file names to distinguish them from other files.
# For example, if you want to exclude all ".txt" files from the search,
# you can simply include "txt" as part of $avoid.
#
# NOTE: $avoid is used as a case-insensitive regular expression and is
# matched against the *whole* path of each candidate file, directory
# names included. A short alternative such as "pl" therefore also
# excludes every file whose path merely contains those letters (for
# example anything under a ".../samples/" directory). Setting $avoid
# to an empty string disables the exclusion check altogether.
$avoid = "(backup|cgi|pl|txt)";
# Define the variable $cgiurl as the URL of the WebSearch script itself.
# It is written out as the ACTION of the "Previous"/"Next" paging forms
# and of the "New Search" form on the results page.
$cgiurl = 'http://www.foo.com/cgi-bin/websearch.pl';
# Define the variables $basepath and $baseurl with the absolute path
# and corresponding URL for a "base" directory under which the various
# directories to be searched all lie. These variables are used to
# convert the UNIX paths to URLs for the results page.
# The conversion replaces the first occurrence of $basepath in a file's
# path with $baseurl, ignoring case.
$basepath = '/usr/www/foo/scripts/';
$baseurl = 'http://www.foo.com/scripts/';
# If you wish to be able to specify several other possible URLs -- if,
# for example, some of the files you wish to search fall under a
# different virtual domain or have to be referenced "through" a shopping
# cart or other CGI program -- uncomment the lines below and define
# %otherurls with the desired path/URL pairs. Note that the script will
# check this variable for matches to convert paths to URLs *before* it
# checks the $basepath and $baseurl variables, so these paths should
# either be subpaths of or completely distinct from the "base" path
# defined above.
# %otherurls = (
# '/usr/www/foo/scripts/dir2/sub1/',
# 'http://www.foo.com/cgi-bin/some.cgi?access=',
# '/usr/www/foo/scripts/dir2/sub2/',
# 'http://www.foo.com/scripts/dir2/sub2/another.cgi?read='
# );
# Define the variable $HitsPerPage with the number of matches you want
# to appear on each results page.
# The same value is the step added to or subtracted from the "first"
# and "last" form fields by the "Next"/"Previous" buttons.
$HitsPerPage = 10;
#####################
# THE ACTUAL SCRIPT #
#####################
# You shouldn't have to change anything in this section!
# Script version, shown in the footer link of the results page.
$version = "1.05";

# Day and month name tables. Written with qw() instead of barewords so
# that the lists stay valid if strictures or warnings are ever enabled.
@day   = qw(Sun Mon Tue Wed Thu Fri Sat);
@month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# Read the URL-encoded request body from STDIN. CONTENT_LENGTH is not
# set when the script is run without a POST body, so read nothing then.
$buffer = '';
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'} || 0);

# Decode every "name=value" pair into %FORM. A field that occurs more
# than once ends up as a single comma-separated string.
@pairs = split(/&/, $buffer);
foreach $pair (@pairs) {
    # Split on the first "=" only, so that an unencoded "=" inside the
    # value is kept instead of silently truncating the value.
    ($name, $value) = split(/=/, $pair, 2);
    $name  = '' unless defined $name;
    $value = '' unless defined $value;

    # "+" encodes a space; "%XX" encodes an arbitrary byte.
    foreach ($name, $value) {
        tr/+/ /;
        s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
    }

    # Test for a stored, non-empty value rather than for truth, so that
    # an earlier value of "0" is not overwritten by a later one.
    if (defined $FORM{$name} && length $FORM{$name}) {
        $FORM{$name} = "$FORM{$name}, $value";
    }
    else {
        $FORM{$name} = $value;
    }
}

# Defaults: a case-insensitive "any terms" (boolean OR) search.
$FORM{'boolean'} = "any terms"   unless ($FORM{'boolean'});
$FORM{'case'}    = "insensitive" unless ($FORM{'case'});

# Trim surrounding whitespace from the search terms.
$FORM{'terms'} = '' unless defined $FORM{'terms'};
$FORM{'terms'} =~ s/^\s+//;
$FORM{'terms'} =~ s/\s+$//;

# A phrase search treats the whole input as one term; every other mode
# splits it into whitespace-separated words.
if ($FORM{'boolean'} eq "as a phrase") {
    @terms = ($FORM{'terms'});
}
else {
    @terms = split(/\s+/, $FORM{'terms'});
}

# $terms      - number of terms entered
# $termscount - number of terms matched in a file (starts at $terms)
# $bestmatch  - score that counts as 100% relevance; raised later if a
#               file scores higher
# $matchcount - number of files with at least one hit
# $filecount  - number of files actually searched
$terms      = @terms;
$termscount = $terms;
$bestmatch  = 5*$terms;
$matchcount = 0;
$filecount  = 0;
# Build @FILES, the list of text files to search, from the directories
# configured in @files. An entry ending in "*" also contributes the text
# files found in its immediate subdirectories (one level only).
foreach $file (@files) {
    # Strip the trailing "*" marker and remember that it was present.
    $subdirs = ($file =~ s/\*$//) ? 1 : 0;

    foreach $sub (List_Dir($file)) {
        if ((-d "$file$sub") && ($subdirs > 0)) {
            foreach $subsub (List_Dir("$file$sub")) {
                if (-T "$file$sub/$subsub") {
                    push (@FILES,"$file$sub/$subsub");
                }
            }
        }
        elsif (-T "$file$sub") {
            push (@FILES,"$file$sub");
        }
    }
}

# List_Dir($dir): return the sorted names of the entries in directory
# $dir, leaving out names that begin with "." (as a plain "ls" does).
# Returns an empty list when the directory cannot be opened.
#
# The directory is read with opendir/readdir rather than by running
# "ls" and splitting its output on whitespace, so names that contain
# spaces stay in one piece and the path never passes through a shell.
sub List_Dir {
    my ($dir) = @_;
    opendir(my $dh, $dir) or return ();
    my @names = sort grep { !/^\./ } readdir($dh);
    closedir($dh);
    return @names;
}
# Compile one pattern per usable search term (terms shorter than three
# characters are ignored). The terms come straight from the visitor, so
# they are quoted with \Q...\E: they are matched as literal text, and
# input such as "(((" or ".*" can neither abort the script with a regex
# syntax error nor change what is searched for. A "case" value other
# than "insensitive" is treated as case-sensitive, which is also how the
# results page labels it.
my @patterns = ();
foreach $term (@terms) {
    next if (length($term) < 3);
    if ($FORM{'case'} eq 'insensitive') {
        push (@patterns, qr/\Q$term\E/i);
    }
    else {
        push (@patterns, qr/\Q$term\E/);
    }
}

# Score every candidate file. Results are left in:
#   %title  - document title (or the path when no title is found)
#   %update - formatted last-modified date
#   %val    - raw number of term occurrences
#   %truval - %val scaled by the fraction of terms that matched
foreach $FILE (@FILES) {
    next if (($avoid) && ($FILE =~ m#$avoid#oi));

    # Three-argument open: the name comes from a directory listing, and
    # a two-argument open would interpret characters such as a trailing
    # "|" or a leading ">" in it. Files that cannot be read are skipped
    # and not counted as searched.
    open (my $fh, '<', $FILE) or next;
    @LINES = <$fh>;
    close ($fh);
    $filecount ++;

    # One stat call supplies both the modification time (read by
    # Get_Date through $mtime) and the size in bytes.
    my @info = stat($FILE);
    $mtime = $info[9];
    $kbytesize = int(((($info[7]) || 0)/1024)+.5);
    $update{$FILE} = Get_Date();

    $string = join(' ',@LINES);
    $string =~ s/\n/ /g;
    $val{$FILE} = 0;

    # The title is repeated once per kilobyte of file size, so that
    # hits in the title carry weight in proportion to document length.
    if ($string =~ /<TITLE>([^>]+)<\/TITLE>/i) {
        $title{$FILE} = "$1";
        $titlestring = "$title{$FILE}"x$kbytesize;
    }
    elsif ($string =~ /SUBJECT>(.+)POSTER>/i) {
        $title{$FILE} = "$1";
        $titlestring = "$title{$FILE}"x$kbytesize;
    }
    else {
        $title{$FILE} = "$FILE";
        $titlestring = "";
    }
    $title{$FILE} =~ s/^\s*//;
    $title{$FILE} =~ s/\s*$//;

    # Keep ALT text and META description/keywords content, then drop
    # every remaining tag so that markup itself is never searched.
    $string =~ s/<[^>]*\s+ALT\s*=\s*"(([^>"])*)"[^>]*>/$1/ig;
    $string =~ s/<[^>]*META[^>]+NAME\s*=[ "]*(description|keywords)[ "]+CONTENT\s*=\s*"(([^>"])*)"[^>]*>/$2/ig;
    $string =~ s/<([^>])*>//g;
    $string = $titlestring." ".$string;

    # Occurrences are counted by deleting them: s///g returns the number
    # of substitutions made (or an empty string when there were none).
    if ($FORM{'boolean'} eq 'all terms') {
        foreach my $pattern (@patterns) {
            $test = ($string =~ s/$pattern//g) || 0;
            if ($test < 1) {
                # One missing term disqualifies the whole file.
                $val{$FILE} = 0;
                last;
            }
            $val{$FILE} = $val{$FILE}+$test;
        }
    }
    else {
        $termscount = 0;
        foreach my $pattern (@patterns) {
            # $test is assigned for every term, so a count can never be
            # carried over from the previous term or file.
            $test = ($string =~ s/$pattern//g) || 0;
            $val{$FILE} = $val{$FILE}+$test;
            if ($test > 0) { $termscount++; }
        }
    }

    if ($val{$FILE} > 0) {
        $truval{$FILE} = ($val{$FILE}*($termscount/$terms));
        if ($truval{$FILE} > $bestmatch) {
            $bestmatch = $truval{$FILE};
        }
        $matchcount++;
    }
}
##########
# OUTPUT #
##########
# The script's output can, of course, be modified to suit the specific
# "look" of the site being searched. Don't try to make major changes,
# though, unless you're reasonably sure that you know what you're doing,
# as there are a *lot* of conditionals and variables in the output.
# Escape_HTML($text): return $text with the characters & < > " replaced
# by HTML entities, so that it can be placed safely in element content
# or inside a double-quoted attribute value. An undefined argument
# yields an empty string.
sub Escape_HTML {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Everything below that originates from the request is escaped before
# it is echoed; printing it raw would let a crafted form submission
# inject markup or script into the results page.
my $safeboolean = Escape_HTML($FORM{'boolean'});
my $safecase    = Escape_HTML($FORM{'case'});

# The usable terms (three characters or more), each followed by a
# space, exactly as they are shown and passed on to the next request.
my $shownterms = Escape_HTML(
    join('', map { "$_ " } grep { length($_) >= 3 } @terms));

# "first" and "last" are the 1-based positions of the first and last
# match on this page. Anything that is not a positive integer is
# replaced by the default for a first page.
unless (defined $FORM{'first'} && $FORM{'first'} =~ /^\d+\z/
        && $FORM{'first'} > 0) {
    $FORM{'first'} = 1;
}
unless (defined $FORM{'last'} && $FORM{'last'} =~ /^\d+\z/
        && $FORM{'last'} > 0) {
    $FORM{'last'} = $FORM{'first'} + $HitsPerPage - 1;
}

print "Content-type: text/html\n\n";
print "<HTML><HEAD><TITLE>Search Results</TITLE></HEAD>\n";
print "<BODY BGCOLOR=\"#ffffff\" TEXT=\"#000000\">";
print "<H1 ALIGN=CENTER>Search Results</H1>\n";
print "<P ALIGN=CENTER>Keywords ($safeboolean, ";
print "case $safecase): <STRONG>";
print $shownterms;
print "</STRONG></P>\n";
print "<P ALIGN=CENTER><SMALL>";
print "(<STRONG>$filecount</STRONG> files searched; ";
print "<STRONG>$matchcount</STRONG> match";
if ($matchcount == 1) {
    print " found)";
}
else {
    print "es found)";
}
print "</SMALL>\n";

if ($matchcount == 0) {
    print "<P ALIGN=CENTER>No documents match your search criteria!";
    print "<BR>You might want to revise them and try again.</P>\n";
}
else {
    print "<P ALIGN=CENTER><STRONG>Matches $FORM{'first'} ";
    if ($matchcount < $FORM{'last'}) {
        print "- $matchcount</STRONG></P>\n";
    }
    else {
        print "- $FORM{'last'}</STRONG></P>\n";
    }
    print "<HR WIDTH=50%>\n";

    # List the matches for this page, best score first.
    $Count = 0;
    print "<P><UL>\n";
    foreach $key (sort ByValue keys %truval) {
        $Count++;
        next if ($Count < $FORM{'first'});
        last if ($Count > $FORM{'last'});

        # Turn the file path into a URL. The configured paths are
        # quoted with \Q...\E so that characters such as "." in them
        # match literally instead of acting as regex metacharacters.
        $fileurl = $key;
        if (%otherurls) {
            foreach $path (keys %otherurls) {
                $fileurl =~ s/\Q$path\E/$otherurls{$path}/i;
            }
        }
        $fileurl =~ s/\Q$basepath\E/$baseurl/i;

        print "<LI><STRONG><A HREF=\"", Escape_HTML($fileurl), "\">";
        # The title is text taken from the document itself and may
        # already contain entities, so it is printed unchanged.
        print "$title{$key}</A></STRONG> ";
        $relevance = int(((100/$bestmatch)*$truval{$key})+.5);
        print "(Relevance: $relevance%)\n";
        print "<BR>Last updated $update{$key}<P>\n";
    }
    print "</UL></P>\n";

    # Paging buttons: each is a small form that re-submits the same
    # search with "first" and "last" shifted by one page.
    my @pagers = ();
    if ($FORM{'first'} > 1) {
        push (@pagers, [ $FORM{'first'}-$HitsPerPage,
                         $FORM{'last'}-$HitsPerPage,
                         "Previous $HitsPerPage Matches" ]);
    }
    if ($FORM{'last'} < $matchcount) {
        push (@pagers, [ $FORM{'first'}+$HitsPerPage,
                         $FORM{'last'}+$HitsPerPage,
                         "Next $HitsPerPage Matches" ]);
    }
    print "<P><CENTER>\n";
    foreach my $pager (@pagers) {
        my ($pagefirst, $pagelast, $label) = @$pager;
        print "<FORM METHOD=POST ACTION=\"$cgiurl\">\n";
        print "<INPUT TYPE=HIDDEN NAME=\"terms\" VALUE=\"";
        print $shownterms;
        print "\">\n";
        print "<INPUT TYPE=HIDDEN NAME=\"boolean\" ";
        print "VALUE=\"$safeboolean\">\n";
        print "<INPUT TYPE=HIDDEN NAME=\"case\" ";
        print "VALUE=\"$safecase\">\n";
        print "<INPUT TYPE=HIDDEN NAME=\"first\" ";
        print "VALUE=\"$pagefirst\">\n";
        print "<INPUT TYPE=HIDDEN NAME=\"last\" ";
        print "VALUE=\"$pagelast\">\n";
        print "<INPUT TYPE=SUBMIT ";
        print "VALUE=\"$label\">\n";
        print "</FORM>\n";
    }
    print "</CENTER></P>\n";
}

# "New Search" form, pre-filled with the current terms and options.
print "<HR><H2 ALIGN=CENTER>New Search</H2>\n";
print "<FORM METHOD=POST ACTION=\"$cgiurl\">\n";
print "<P><CENTER>Terms for which to search (separated by spaces):\n";
print "<BR><INPUT TYPE=TEXT NAME=\"terms\" SIZE=60 VALUE=\"";
print $shownterms;
print "\">\n<P>Find: <SELECT NAME=\"boolean\"> ";
if ($FORM{'boolean'} eq 'any terms') {
    print "<OPTION SELECTED>any terms<OPTION>all terms";
    print "<OPTION>as a phrase</SELECT> ";
}
elsif ($FORM{'boolean'} eq 'all terms') {
    print "<OPTION>any terms<OPTION SELECTED>all terms";
    print "<OPTION>as a phrase</SELECT> ";
}
else {
    print "<OPTION>any terms<OPTION>all terms";
    print "<OPTION SELECTED>as a phrase</SELECT> ";
}
print "Case: <SELECT NAME=\"case\"> ";
if ($FORM{'case'} eq 'insensitive') {
    print "<OPTION SELECTED>insensitive<OPTION>sensitive</SELECT>\n";
}
else {
    print "<OPTION>insensitive<OPTION SELECTED>sensitive</SELECT>\n";
}
print "<P><INPUT TYPE=SUBMIT VALUE=\"Search\">";
print "</CENTER></FORM></P><HR>\n";
print "<P ALIGN=CENTER><SMALL>";
print "<A HREF=\"http://awsd.com/scripts/websearch/\">";
print "WebSearch $version</A>";
print "</SMALL></P></BODY></HTML>\n";
exit;
###############
# SUBROUTINES #
###############
# You shouldn't need to change these, either!
# Get_Date: format the timestamp held in the global $mtime (seconds
# since the epoch) as "D Mon YYYY" in local time, e.g. "25 May 1997".
# When $mtime is empty or zero the current time is used instead.
#
# Takes no arguments. Reads $mtime and @month; leaves the day of month,
# zero-based month and years-since-1900 in $mday, $mon and $yr, and the
# formatted string in $date, which is also the return value.
sub Get_Date {
    $mtime = time unless ($mtime);
    ($mday,$mon,$yr) = (localtime($mtime))[3,4,5];
    # localtime returns the year as an offset from 1900 (100 for 2000),
    # so it has to be added to 1900; gluing "19" in front of it printed
    # years such as "19100".
    my $year = 1900 + $yr;
    $date = "$mday $month[$mon] $year";
    return $date;
}
# ByValue: sort comparator that orders %truval keys by descending
# score. Reads the sort variables $a and $b, and leaves the two scores
# being compared in the globals $aval and $bval.
sub ByValue {
    ($aval, $bval) = ($truval{$a}, $truval{$b});
    return $bval <=> $aval;
}