From 75ac48742bdf2826a135375fe1acf5f399bbbaf3 Mon Sep 17 00:00:00 2001 From: Stefan Suhren Date: Wed, 2 Dec 2015 16:52:58 +0100 Subject: Create word array and test it --- Aufgabe5/searchengine.pm | 26 +++++++++++++++++++-- Aufgabe5/skript3.pl | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 Aufgabe5/skript3.pl diff --git a/Aufgabe5/searchengine.pm b/Aufgabe5/searchengine.pm index 9a1690b..9a70367 100644 --- a/Aufgabe5/searchengine.pm +++ b/Aufgabe5/searchengine.pm @@ -33,10 +33,10 @@ use open ":encoding(UTF-8)"; our @ISA= qw( Exporter ); # these CAN be exported. -our @EXPORT_OK = qw( buildStoplist ); +our @EXPORT_OK = qw( buildStoplist getWords ); # these are exported by default. -our @EXPORT = qw( buildStoplist ); +our @EXPORT = qw( buildStoplist getWords ); sub buildStoplist { my ( $stoplistFileName, $stoplist ) = @_; @@ -57,3 +57,25 @@ sub buildStoplist { or warn "$0 : failed to close input file '$stoplistFileName' : $!\n"; } ## --- end sub buildStoplist + + +sub getWords { + my ( $text, $stoplist ) = @_; + + defined $text or die "Text must be supplied"; + defined $stoplist or die "Stoplist hash must be supplied"; + + # Split at whitespaces + my @words = split /[[:space:]]+/, $text; + + # Apply regex + @words = map /([[:lower:]]{3,})/i , @words; + + # Convert to lower case + @words = map {lc $_} @words; + + # Remove all words that are in the stoplist + @words = map {!exists $stoplist->{$_} ? ($_) : ()} @words; + + return @words; +} ## --- end sub getWords diff --git a/Aufgabe5/skript3.pl b/Aufgabe5/skript3.pl new file mode 100644 index 0000000..00927c6 --- /dev/null +++ b/Aufgabe5/skript3.pl @@ -0,0 +1,60 @@ +#!/usr/bin/env perl +#=============================================================================== +# +# FILE: skript3.pl +# +# USAGE: ./skript3.pl +# +# DESCRIPTION: +# +# OPTIONS: --- +# REQUIREMENTS: --- +# BUGS: --- +# NOTES: --- +# AUTHOR: Stefan Suhren (su), suhren.stefan@fh-swf.de +# ORGANIZATION: FH Südwestfalen, Iserlohn +# VERSION: 1.0 +# CREATED: 02.12.2015 14:14:02 +# REVISION: --- +#=============================================================================== + +use strict; +use warnings; +use utf8; + +# Add own module +use searchengine; + +# For dumping data +use Data::Dumper; + +# enforce utf-8 mode +binmode (STDIN, ":encoding(UTF-8)"); +binmode (STDOUT, ":encoding(UTF-8)"); +binmode (STDERR, ":encoding(UTF-8)"); +use open ":encoding(UTF-8)"; + +my $stoplistFile = "stoplist.txt"; +my %stoplist; + +buildStoplist($stoplistFile, \%stoplist); + +for (my $i = 1; $i <= 4; $i++) +{ + my $wordFileName = 'doc.einfach/doc'.$i.'.txt'; # input file name + + open my $wordFile, '<', $wordFileName + or die "$0 : failed to open input file '$wordFileName' : $!\n"; + + # Read in slurp mode + my $fileContent = do{ + local $/ = undef; + <$wordFile>; + }; + + close $wordFile + or warn "$0 : failed to close input file '$wordFileName' : $!\n"; + + print $wordFileName . "\n"; + print Dumper(getWords($fileContent, \%stoplist)); +} -- cgit v1.2.3-70-g09d2