[COMMIT LOGREPORT] service/www/lib/Lire/DlfAnalysers ReferrerCategoriser.pm,1.3,1.4
Joost van Baal
vanbaal at users.sourceforge.net
Sun Jul 16 15:29:41 CEST 2006
Update of /cvsroot/logreport/service/www/lib/Lire/DlfAnalysers
In directory sc8-pr-cvs6.sourceforge.net:/tmp/cvs-serv2296
Modified Files:
ReferrerCategoriser.pm
Log Message:
No longer crash on %-escaped bytes in the &q=...& field of a search URL
that fall outside the UTF-8 range. We now check whether the search engine
gives us the name of a non-UTF-8 encoding. If so, we don't even try to
decode the search string. For now, we only deal with Google: we don't know
how other search engines pass encoding information to us. This is a partial
fix for http://bugs.debian.org/291063 .
Index: ReferrerCategoriser.pm
===================================================================
RCS file: /cvsroot/logreport/service/www/lib/Lire/DlfAnalysers/ReferrerCategoriser.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- ReferrerCategoriser.pm 16 Jul 2006 12:02:52 -0000 1.3
+++ ReferrerCategoriser.pm 16 Jul 2006 13:29:39 -0000 1.4
@@ -60,33 +60,35 @@
# Order is important
# Because google.yahoo.com isn't the same as www.google.com
+#
+# $host_match, $param, $encoding, $engine
my @Engine2Keywords =
(
- ["yahoo.com", 'p', "Yahoo!"],
- ["altavista.com", 'q', "AltaVista"],
- ["google", 'q', "Google"],
- ["google", 'query', "Google"],
- ["www.google", 'q', "Google"],
- ["aol.com", 'query', "AOL NetFind"],
- ["eureka.com", 'q', "Eureka"],
- ["lycos.com", 'query', "Lycos"],
- ["hotbot.com", 'MT', "HotBot"],
- ["msn.com", 'MT', "Microsoft Network"],
- ["infoseek.com", 'qt', "InfoSeek"],
- ["webcrawler", 'searchText', "WebCrawler"],
- [ "excite", 'search', "Excite"],
- ["netscape.com", 'search', "Netscape"],
- ["mamma.com", 'query', "Mamma"],
- ["alltheweb.com", 'query', "All The Web"],
- ["northernlight.com", 'qr', "Northern Light"],
- ["askjeeves.com", 'ask', "Ask Jeeves"],
- ["looksmart.com", 'key', "Look Smart"],
- ["goto.com", 'key', "Look Smart"],
- ["overture.com", 'Keywords', "Overture"],
- ["about.com", 'terms', "About.COM"],
- ["metacrawler.com", 'general', "Meta Crawler"],
- ["about.com", 'terms', "About.COM"],
- ["iwon.com", 'searchfor', "iWon"],
+ ["yahoo.com", 'p', '', "Yahoo!"],
+ ["altavista.com", 'q', '', "AltaVista"],
+ ["google", 'q', 'ie', "Google"],
+ ["google", 'query', 'ie', "Google"],
+ ["www.google", 'q', 'ie', "Google"],
+ ["aol.com", 'query', '', "AOL NetFind"],
+ ["eureka.com", 'q', '', "Eureka"],
+ ["lycos.com", 'query', '', "Lycos"],
+ ["hotbot.com", 'MT', '', "HotBot"],
+ ["msn.com", 'MT', '', "Microsoft Network"],
+ ["infoseek.com", 'qt', '', "InfoSeek"],
+ ["webcrawler", 'searchText', '', "WebCrawler"],
+ ["excite", 'search', '', "Excite"],
+ ["netscape.com", 'search', '', "Netscape"],
+ ["mamma.com", 'query', '', "Mamma"],
+ ["alltheweb.com", 'query', '', "All The Web"],
+ ["northernlight.com", 'qr', '', "Northern Light"],
+ ["askjeeves.com", 'ask', '', "Ask Jeeves"],
+ ["looksmart.com", 'key', '', "Look Smart"],
+ ["goto.com", 'key', '', "Look Smart"],
+ ["overture.com", 'Keywords', '', "Overture"],
+ ["about.com", 'terms', '', "About.COM"],
+ ["metacrawler.com", 'general', '', "Meta Crawler"],
+ ["about.com", 'terms', '', "About.COM"],
+ ["iwon.com", 'searchfor', '', "iWon"],
);
sub categorise {
@@ -109,12 +111,25 @@
my $host = $parsed_url->{'host'};
foreach my $spec ( @Engine2Keywords ) {
- my ( $host_match, $param, $engine ) = @$spec;
+ my ( $host_match, $param, $encoding, $engine ) = @$spec;
next if index( lc $host, $host_match ) == -1;
next unless $parsed_url->{'query'} =~ /$param=(.*?)([;&]|$)/;
my $keywords = $1;
+ my $enc;
+ if ($encoding) {
+ # we know the way the current search engine passes information about
+ # the used character encoding
+ $parsed_url->{'query'} =~ /$encoding=(.*?)([;&]|$)/ and $enc = $1;
+ }
$keywords =~ tr/+/ /s;
- $keywords =~ s/%([0-9a-fA-F]{2})/chr(hex $1)/eg; # FIXME: take &ie=ISO-8859-1& into account here! We wrongly assume anything is UTF8-encoded here.
+ # if the keywords are encoded in non-utf8, enc specifies the encoding used.
+ #
+ # (We've seen ISO-8859-1:
+ # http://www.google.fr/search?num=100&hl=fr&ie=ISO-8859-1&q=images+de+synth%E8se&btnG=Rechercher&meta=
+ # )
+ $keywords =~ s/%([0-9a-fA-F]{2})/chr(hex $1)/eg unless $enc;
+ # FIXME: take &ie=ISO-8859-1& into account here!
+ # We don't even try to manage anything which has a non-UTF8-encoding here.
$dlf->{'search_engine'} = $engine;
# we've seen
# http://www.google.com/search?as_q=&num=10&btnG=Google+Search&\
--
To UNSUBSCRIBE, email to commit-request at logreport.org with a subject of
"unsubscribe". Trouble? Send an email with subject "help" to
commit-request at logreport.org
More information about the Commit
mailing list