[COMMIT LOGREPORT] service/www/lib/Lire/DlfAnalysers ReferrerCategoriser.pm,1.3,1.4

Joost van Baal vanbaal at users.sourceforge.net
Sun Jul 16 15:29:41 CEST 2006


Update of /cvsroot/logreport/service/www/lib/Lire/DlfAnalysers
In directory sc8-pr-cvs6.sourceforge.net:/tmp/cvs-serv2296

Modified Files:
	ReferrerCategoriser.pm 
Log Message:

No longer crash on %-escaped stuff in the &q=...& field of a search URL that
is outside the UTF-8 range.  We now check whether the search engine gives us
the name of a non-UTF-8 encoding.  If so, we don't even try to decode the
search string.  For now, we only deal with Google: we don't know how
other search engines pass encoding information to us.  This is a partial
fix for http://bugs.debian.org/291063 .



Index: ReferrerCategoriser.pm
===================================================================
RCS file: /cvsroot/logreport/service/www/lib/Lire/DlfAnalysers/ReferrerCategoriser.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- ReferrerCategoriser.pm	16 Jul 2006 12:02:52 -0000	1.3
+++ ReferrerCategoriser.pm	16 Jul 2006 13:29:39 -0000	1.4
@@ -60,33 +60,35 @@
 
 # Order is important
 # Because google.yahoo.com isn't the same as www.google.com
+#
+#   $host_match,         $param, $encoding, $engine
 my @Engine2Keywords = 
   (
-   ["yahoo.com",         'p',          "Yahoo!"],
-   ["altavista.com",     'q',          "AltaVista"],
-   ["google",            'q',          "Google"],
-   ["google",            'query',      "Google"],
-   ["www.google",        'q',          "Google"],
-   ["aol.com",           'query',      "AOL NetFind"],
-   ["eureka.com",        'q',          "Eureka"],
-   ["lycos.com",         'query',      "Lycos"],
-   ["hotbot.com",        'MT',         "HotBot"],
-   ["msn.com",           'MT',         "Microsoft Network"],
-   ["infoseek.com",      'qt',         "InfoSeek"],
-   ["webcrawler",        'searchText', "WebCrawler"],
-   [ "excite",           'search',     "Excite"],
-   ["netscape.com",      'search',     "Netscape"],
-   ["mamma.com",         'query',      "Mamma"],
-   ["alltheweb.com",     'query',      "All The Web"],
-   ["northernlight.com", 'qr',         "Northern Light"],
-   ["askjeeves.com",     'ask',        "Ask Jeeves"],
-   ["looksmart.com",     'key',        "Look Smart"],
-   ["goto.com",          'key',        "Look Smart"],
-   ["overture.com",      'Keywords',   "Overture"],
-   ["about.com",         'terms',      "About.COM"],
-   ["metacrawler.com",   'general',    "Meta Crawler"],
-   ["about.com",         'terms',      "About.COM"],
-   ["iwon.com",          'searchfor',  "iWon"],
+   ["yahoo.com",         'p',          '',   "Yahoo!"],
+   ["altavista.com",     'q',          '',   "AltaVista"],
+   ["google",            'q',          'ie', "Google"],
+   ["google",            'query',      'ie', "Google"],
+   ["www.google",        'q',          'ie', "Google"],
+   ["aol.com",           'query',      '',   "AOL NetFind"],
+   ["eureka.com",        'q',          '',   "Eureka"],
+   ["lycos.com",         'query',      '',   "Lycos"],
+   ["hotbot.com",        'MT',         '',   "HotBot"],
+   ["msn.com",           'MT',         '',   "Microsoft Network"],
+   ["infoseek.com",      'qt',         '',   "InfoSeek"],
+   ["webcrawler",        'searchText', '',   "WebCrawler"],
+   ["excite",            'search',     '',   "Excite"],
+   ["netscape.com",      'search',     '',   "Netscape"],
+   ["mamma.com",         'query',      '',   "Mamma"],
+   ["alltheweb.com",     'query',      '',   "All The Web"],
+   ["northernlight.com", 'qr',         '',   "Northern Light"],
+   ["askjeeves.com",     'ask',        '',   "Ask Jeeves"],
+   ["looksmart.com",     'key',        '',   "Look Smart"],
+   ["goto.com",          'key',        '',   "Look Smart"],
+   ["overture.com",      'Keywords',   '',   "Overture"],
+   ["about.com",         'terms',      '',   "About.COM"],
+   ["metacrawler.com",   'general',    '',   "Meta Crawler"],
+   ["about.com",         'terms',      '',   "About.COM"],
+   ["iwon.com",          'searchfor',  '',   "iWon"],
   );
 
 sub categorise {
@@ -109,12 +111,25 @@
 
     my $host = $parsed_url->{'host'};
     foreach my $spec ( @Engine2Keywords ) {
-	my ( $host_match, $param, $engine ) = @$spec;
+	my ( $host_match, $param, $encoding, $engine ) = @$spec;
 	next if index( lc $host, $host_match ) == -1;
 	next unless $parsed_url->{'query'} =~ /$param=(.*?)([;&]|$)/;
 	my $keywords = $1;
+        my $enc;
+	if ($encoding) {
+             # we know the way the current search engine passes information about
+             # the used character encoding
+             $parsed_url->{'query'} =~ /$encoding=(.*?)([;&]|$)/ and $enc = $1;
+        }
 	$keywords =~ tr/+/ /s;
-	$keywords =~ s/%([0-9a-fA-F]{2})/chr(hex $1)/eg;  # FIXME: take &ie=ISO-8859-1& into account here!  We wrongly assume anything is UTF8-encoded here.
+        # if the keywords are encoded in non-utf8, enc specifies the encoding used.
+        #
+        # (We've seen ISO-8859-1:
+        # http://www.google.fr/search?num=100&hl=fr&ie=ISO-8859-1&q=images+de+synth%E8se&btnG=Rechercher&meta=
+        # )
+	$keywords =~ s/%([0-9a-fA-F]{2})/chr(hex $1)/eg unless $enc;
+        # FIXME: take &ie=ISO-8859-1& into account here!
+        # We don't even try to manage anything which has a non-UTF8-encoding here.
         $dlf->{'search_engine'} = $engine;
         # we've seen
         # http://www.google.com/search?as_q=&num=10&btnG=Google+Search&\

-- 
To UNSUBSCRIBE, email to commit-request at logreport.org with a subject of 
"unsubscribe". Trouble? Send an email with subject "help" to 
commit-request at logreport.org



More information about the Commit mailing list