Приглашаем посетить
Грин (grin.lit-info.ru)

Metaphone

#!/usr/local/bin/perl

#while (1) {
    %matches = ();
#    print "File to check? ";
#    $file = <STDIN>;
$file = "../survey.log\n";
    chop($file);
#    print "Word to check? ";
#    $word = <STDIN>;
$word = "matt\n";
    chop($word);
    
    # Convert search word to metaphone
    
    ($check = &metaphone($word)) || die "$word is not a valid word";
    $check_first = substr($check, 0, 1);
    $first_match = " " . $check_first;
    
    # Read through file
    
    open(FILE, "<$file") || die "Could not open $file";
    while (<FILE>) {
        next if /^$/;
        
        # Convert to uppercase, remove non-alpha & dupes, prefix
        
        $_ = ' ' . $_;
        tr/a-zA-Z/ /cs;
        $old_line = $_;
        tr/a-z/A-Z/;
        tr/G/-/;  
        tr/A-Z/A-Z/s;
        tr/-/G/;
	    s/ ([GKP](N)|A(E)|W(R)|W(HO)|([GW])H)/ $2$3$4$5$6/;
	    s/ X/ S-/;
        
        # Process each word with matching first letter
        
        $start_word = index($_, $first_match) + 1;

        while ($start_word > $[) {

            $end_word = index($_, ' ', $start_word + 1);
            $word_length = $end_word - $start_word;
            $fileword = substr($_, $start_word, $word_length);

            # Process if word is long enough
            
$startt = (times)[0];
            if ($word_length > 2) {
                                
                # Convert to metaphone if the word is new
                    
                if (!$converted{$fileword}) { 
                    $converted{$fileword} = &metaphone($fileword);
                }
                # Check for match
                   
                if ($check eq $converted{$fileword}) { 
                    
                    # Find original word
                    
                    $save_line = $_;
                    $_ = substr($_, 0, $start_word);
                    $word_num = tr/ / /;
                    $_ = $save_line;
                    for ($pos = -1; $word_num; --$word_num) {
                        $pos = index($old_line, ' ', $pos + 1);
                    }
                    ++$pos;
                    $endpos = index($old_line, ' ', $pos);
                    $fileword = substr($old_line, $pos, $endpos - $pos);
                    
                    # Increment match count
                    
                    $matches{$fileword}++;
                }                
            }
            $start_word = index($_, $first_match, $end_word);
$total += (times)[0] - $startt;
        }
    }
    print "\n";
print "Total time: $total\n";
#    print "The following words in $file match $word:\n";
#    foreach $match (sort keys %matches) { 
#        print "    $match ($matches{$match})\n" 
#    }
#}


### Subroutine:    &metaphone()
###
### Function:      To convert a word to its phonetic equivalent using
###                a slightly modified version of the Metaphone 
###                algorithm, originally developed by Lawrence Philips
###
### Usage:         &metaphone($word);
###
### Variables:     $word -- Word to be converted. 
###                         Example "cookbook"
###
### Returns:       String containing converted result if successful
###                Null string if unsuccessful ($word not a word) 
###
### Files Created: None

sub metaphone {

    # Initialize variables
    
    local($word) = @_[0];
    
    # Convert to uppercase, remove non-alpha
    
    $word =~ tr/a-z/A-Z/;
    $word =~ tr/A-Z//cd;

    # Remove duplicate letters except GG
    
    $word =~ tr/G/-/;  
    $word =~ tr/A-Z/A-Z/s;
    $word =~ tr/-/G/;
    
    # Convert to Metaphone phonetic
    
    $word =~ s/^([GKP](N)|A(E)|W(R)|W(HO)|([GW])H)/$2$3$4$5$6/; # Prefix
    $word =~ s/^X/S-/;                                   
    $word =~ s/MB$/M/;                                          # B
    $word =~ s/X/KS-/g;                                         # X
    $word =~ s/TH/0/g;                                          # TH
    $word =~ s/SC([EIY])/S-$1/g;                                # C
    $word =~ s/C(IA)|SCH$|TCH|([^S])CH/$2X$1/g;
    $word =~ s/C([EIY])/-S-$1/g;    
    $word =~ s/G((N)|(NED))$/-$2$3/;                            # G
    $word =~ s/(D|([^G]))G([EIY])/$2J$3/g;
    $word =~ s/((H...)|([BDH]..))GH$/$2$3/;
    $word =~ s/(PH|V|GH$)/F/g;                                  # PH/V/GH
    $word =~ s/GHT/T/g;
    $word =~ s/(.)G(H[^AEIOU])/$1-$2/g;
    $word =~ s/([AEIOU])H([^AEIOU])/$1-$2/g;                    # H
    $word =~ s/SH/X/g;
    $word =~ s/[WY]($|[^AEIOU])/-$1/g;                          # W/Y
    $word =~ s/(C[HK]|C|Q|G+)/K/g;                              # CK/Q/G
    $word =~ s/(.)[ST](I[OA].)/$1X$2/g;                         # S/T
    $word =~ tr/Z/S/;                                           # Z
    $word =~ tr/D/T/;                                           # D

    # Remove vowels and placeholders
    
    $word =~ s/(.)[AEIOU-]+/$1/g;

    # Cut to four characters, pad with spaces if necessary
    
    if ($word) { $word = substr($word . "   ", 0, 4) }
    
    # Return the result (null if original $word was not a word)
    
    $word;
}