summaryrefslogtreecommitdiffstats
path: root/Aufgabe5/searchengine.pm
blob: 9a70367fbf814aee6191787d7aa68a732a56b173 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#
#===============================================================================
#
#         FILE: searchengine.pm
#
#  DESCRIPTION: 
#
#        FILES: ---
#         BUGS: ---
#        NOTES: ---
#       AUTHOR: Stefan Suhren (su), suhren.stefan@fh-swf.de
# ORGANIZATION: FH Südwestfalen, Iserlohn
#      VERSION: 1.0
#      CREATED: 02.12.2015 14:29:44
#     REVISION: ---
#===============================================================================
package searchengine;

use strict;
use warnings;
use utf8;

# use common import function
use Exporter;

# enforce utf-8 mode
binmode (STDIN, ":encoding(UTF-8)");
binmode (STDOUT, ":encoding(UTF-8)");
binmode (STDERR, ":encoding(UTF-8)");
use open ":encoding(UTF-8)";

# add exporter as a parent for this package.
our @ISA= qw( Exporter );

# these CAN be exported.
our @EXPORT_OK = qw( buildStoplist getWords );

# these are exported by default.
our @EXPORT = qw( buildStoplist getWords );

sub buildStoplist {
	my ( $stoplistFileName, $stoplist ) = @_;
	
	defined $stoplistFileName or die "StoplistFile must be supplied.";
	defined $stoplist or die "Stoplist hash must be supplied.";

	open  my $stoplistFile, '<', $stoplistFileName
		or die  "$0 : failed to open  input file '$stoplistFileName' : $!\n";

	while( my $word = <$stoplistFile> )
	{
		chomp $word;
		$stoplist->{$word} = '';
	}

	close  $stoplistFile
		or warn "$0 : failed to close input file '$stoplistFileName' : $!\n";

} ## --- end sub buildStoplist


sub getWords {
	my	( $text, $stoplist ) = @_;
	
	defined $text or die "Text must be supplied";
	defined $stoplist or die "Stoplist hash must be supplied";

	# Split at whitespaces
	my @words = split /[[:space:]]+/, $text;

	# Apply regex
	@words = map /([[:lower:]]{3,})/i , @words;

	# Convert to lower case
	@words = map {lc $_} @words;

	# Remove all words that are in the stoplist
	@words = map {!exists $stoplist->{$_} ? ($_) : ()} @words;

	return @words;
} ## --- end sub getWords