# #=============================================================================== # # FILE: searchengine.pm # # DESCRIPTION: # # FILES: --- # BUGS: --- # NOTES: --- # AUTHOR: Stefan Suhren (su), suhren.stefan@fh-swf.de # ORGANIZATION: FH Südwestfalen, Iserlohn # VERSION: 1.0 # CREATED: 02.12.2015 14:29:44 # REVISION: --- #=============================================================================== package searchengine; use strict; use warnings; use utf8; # use common import function use Exporter; # enforce utf-8 mode binmode (STDIN, ":encoding(UTF-8)"); binmode (STDOUT, ":encoding(UTF-8)"); binmode (STDERR, ":encoding(UTF-8)"); use open ":encoding(UTF-8)"; # add exporter as a parent for this package. our @ISA= qw( Exporter ); # these CAN be exported. our @EXPORT_OK = qw( buildStoplist getWords ); # these are exported by default. our @EXPORT = qw( buildStoplist getWords ); sub buildStoplist { my ( $stoplistFileName, $stoplist ) = @_; defined $stoplistFileName or die "StoplistFile must be supplied."; defined $stoplist or die "Stoplist hash must be supplied."; open my $stoplistFile, '<', $stoplistFileName or die "$0 : failed to open input file '$stoplistFileName' : $!\n"; while( my $word = <$stoplistFile> ) { chomp $word; $stoplist->{$word} = ''; } close $stoplistFile or warn "$0 : failed to close input file '$stoplistFileName' : $!\n"; } ## --- end sub buildStoplist sub getWords { my ( $text, $stoplist ) = @_; defined $text or die "Text must be supplied"; defined $stoplist or die "Stoplist hash must be supplied"; # Split at whitespaces my @words = split /[[:space:]]+/, $text; # Apply regex @words = map /([[:lower:]]{3,})/i , @words; # Convert to lower case @words = map {lc $_} @words; # Remove all words that are in the stoplist @words = map {!exists $stoplist->{$_} ? ($_) : ()} @words; return @words; } ## --- end sub getWords