summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Aufgabe5/searchengine.pm26
-rw-r--r--Aufgabe5/skript3.pl60
2 files changed, 84 insertions, 2 deletions
diff --git a/Aufgabe5/searchengine.pm b/Aufgabe5/searchengine.pm
index 9a1690b..9a70367 100644
--- a/Aufgabe5/searchengine.pm
+++ b/Aufgabe5/searchengine.pm
@@ -33,10 +33,10 @@ use open ":encoding(UTF-8)";
our @ISA= qw( Exporter );
# these CAN be exported.
-our @EXPORT_OK = qw( buildStoplist );
+our @EXPORT_OK = qw( buildStoplist getWords );
# these are exported by default.
-our @EXPORT = qw( buildStoplist );
+our @EXPORT = qw( buildStoplist getWords );
sub buildStoplist {
my ( $stoplistFileName, $stoplist ) = @_;
@@ -57,3 +57,25 @@ sub buildStoplist {
or warn "$0 : failed to close input file '$stoplistFileName' : $!\n";
} ## --- end sub buildStoplist
+
+
+sub getWords {
+ my ( $text, $stoplist ) = @_;
+
+ defined $text or die "Text must be supplied";
+ defined $stoplist or die "Stoplist hash must be supplied";
+
+ # Split at whitespaces
+ my @words = split /[[:space:]]+/, $text;
+
+ # Apply regex
+ @words = map /([[:lower:]]{3,})/i , @words;
+
+ # Convert to lower case
+ @words = map {lc $_} @words;
+
+ # Remove all words that are in the stoplist
+ @words = map {!exists $stoplist->{$_} ? ($_) : ()} @words;
+
+ return @words;
+} ## --- end sub getWords
diff --git a/Aufgabe5/skript3.pl b/Aufgabe5/skript3.pl
new file mode 100644
index 0000000..00927c6
--- /dev/null
+++ b/Aufgabe5/skript3.pl
@@ -0,0 +1,60 @@
+#!/usr/bin/env perl
+#===============================================================================
+#
+# FILE: skript3.pl
+#
+# USAGE: ./skript3.pl
+#
+# DESCRIPTION:
+#
+# OPTIONS: ---
+# REQUIREMENTS: ---
+# BUGS: ---
+# NOTES: ---
+# AUTHOR: Stefan Suhren (su), suhren.stefan@fh-swf.de
+# ORGANIZATION: FH Südwestfalen, Iserlohn
+# VERSION: 1.0
+# CREATED: 02.12.2015 14:14:02
+# REVISION: ---
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+# Add own module
+use searchengine;
+
+# For dumping data
+use Data::Dumper;
+
+# enforce utf-8 mode
+binmode (STDIN, ":encoding(UTF-8)");
+binmode (STDOUT, ":encoding(UTF-8)");
+binmode (STDERR, ":encoding(UTF-8)");
+use open ":encoding(UTF-8)";
+
+my $stoplistFile = "stoplist.txt";
+my %stoplist;
+
+buildStoplist($stoplistFile, \%stoplist);
+
+for (my $i = 1; $i <= 4; $i++)
+{
+ my $wordFileName = 'doc.einfach/doc'.$i.'.txt'; # input file name
+
+ open my $wordFile, '<', $wordFileName
+ or die "$0 : failed to open input file '$wordFileName' : $!\n";
+
+ # Read in slurp mode
+ my $fileContent = do{
+ local $/ = undef;
+ <$wordFile>;
+ };
+
+ close $wordFile
+ or warn "$0 : failed to close input file '$wordFileName' : $!\n";
+
+ print $wordFileName . "\n";
+ print Dumper(getWords($fileContent, \%stoplist));
+}