# Display titles of URLs, generate HTML page
#
# based on url_log 0.2 by Thomas Graf
#
# infected with the gpl virus
#
# Dan Boger
# version: 1.0
use strict;
use Irssi;
use Irssi::Irc;
use File::Temp qw/tempfile/;
use File::Copy;
use vars qw/$VERSION %IRSSI/;
$VERSION = "1.1";
%IRSSI = (
authors => 'Dan Boger, based on code by Thomas Graf',
contact => 'zigdon@gmail.com',
name => 'url_log',
description => 'Fetches HTML pages titles, generates an HTML index',
license => 'GNU GPLv2 or later',
url => 'http://home.peeron.com/~dan/hacks/url_log.html',
);
use LWP;
use LWP::UserAgent;
use HTML::TreeBuilder;
use POSIX qw(strftime);
use strict;
$SIG{CHLD} = 'IGNORE';
my %seen;
my @url_data;
my %channels;
my $user_agent = new LWP::UserAgent( max_size => 8196 );
my $last_save_time = 0;
my $pid;
$user_agent->agent("IrssiUrlLog/$VERSION");
sub expand {
my ( $string, %format ) = @_;
my ( $exp, $repl );
$string =~ s/%$exp/$repl/g while ( ( $exp, $repl ) = each(%format) );
return $string;
}
sub print_msg {
Irssi::active_win()->print("@_");
}
sub title {
my ($url) = @_;
# get the content-type first, so we only check titles for html
my $req = new HTTP::Request HEAD => $url;
my $res = $user_agent->request($req);
return "" unless $res;
return "Content type:"
. $res->header("Content_type") . " ("
. $res->header("Content_length")
. " bytes)"
unless $res->header("Content_type") =~ m#text/|application/x-httpd-php#;
$req = new HTTP::Request GET => $url;
$res = $user_agent->request($req);
return "" unless $res;
my $tree = HTML::TreeBuilder->new_from_content( $res->content );
return "" unless $tree;
my $title = $tree->look_down( _tag => "title" );
$title = $title->as_text if $title;
$tree->delete;
return $title || "";
}
#
# called if url is detected, should get the title
#
sub do_auto_title {
my ( $url, $window ) = @_;
return if ( $url !~ /^https?:\/\// );
return if $seen{$url} and time - $seen{$url} < 600;
$seen{$url} = time;
my ( $fh, $filename ) = tempfile();
$pid = fork();
Irssi::pidwait_add($pid);
if ($pid) { # parent
Irssi::timeout_add_once( 1000, 'monitor_log', [ $filename, $window ] );
}
elsif ( defined $pid ) { # child
close STDIN;
close STDOUT;
close STDERR;
my $format = Irssi::settings_get_str("url_title_format");
my $title = title($url);
$format =~ s/\%u/$url/g;
$format =~ s/\%t/$title/eg;
print $fh "$url = $title \t$format";
close $fh;
exit;
}
}
sub monitor_log {
my ($data) = @_;
my ( $filename, $window ) = @$data;
if ( open( FH, $filename ) ) {
my $line = ;
chomp $line;
close FH;
if ($line) {
$line =~ s/^(\S+) = ([^\t]+)\t//;
my ( $url, $title ) = ( $1, $2 );
$line =~ s/%/%%/g;
$window->print("$line");
unlink $filename;
if ( Irssi::settings_get_str('url_log_html') ) {
my $i = $#url_data + 1;
foreach ( reverse @url_data ) {
$i--;
next unless $_->[3] eq $url;
last if $_->[5];
$_->[5] = $title;
$url_data[$i] = $_;
&generate_html;
last;
}
}
Irssi::pidwait_remove($pid);
}
else {
Irssi::timeout_add_once( 1000, 'monitor_log',
[ $filename, $window ] );
}
}
}
#
# log url to file
#
sub log_to_file {
my ( $nick, $target, $text ) = @_;
my ($lfile) = glob Irssi::settings_get_str("url_log_file");
if ( open( LFD, ">> $lfile" ) ) {
my %h = {
time => time,
nick => $nick,
target => $target,
url => $text
};
print LFD expand(
Irssi::settings_get_str("url_log_format"),
"s",
strftime(
Irssi::settings_get_str("url_log_timestamp_format"), localtime
),
"n", $nick, "t", $target, "u", $text
),
"\n";
close LFD;
}
else {
print_msg "Warning: Unable to open file $lfile $!";
}
}
#
# msg handler
#
sub sig_msg {
my ( $server, $data, $nick, $address ) = @_;
my ( $target, $text ) = split( / :/, $data, 2 );
while (
$text =~ s{
(^|.*?\s) # $1 - leading comments
( # $2 - URL
(?: # - domain
(?:https?://[\w\.-]+ | # stuff starting with http
(?:[\w-]+\.){2,}[a-z]+) # something.like.this
)
(?: # - path (optional)
/ [/\w~.%;&?=-]*
)?
)
($|\s.*) # $3 - trailing comments
}{$1 $3}xi
)
{
my ( $leading, $url, $trailing ) = ( $1, $2, $3 );
my $text = $leading || $trailing;
$text =~ s/.* //;
$text =~ s/^\s+|\s+$//g;
foreach ( keys %seen ) {
next if time - $seen{$_} < 600;
delete $seen{$_};
}
my $ischannel = $server->ischannel($target);
my $channel = $target;
my $level = $ischannel ? MSGLEVEL_PUBLIC: MSGLEVEL_MSGS;
$target = $nick unless $ischannel;
my $window = $server->window_find_closest( $target, $level );
if ( Irssi::settings_get_bool("url_log_auto_head") ) {
do_auto_title( $url, $window );
}
if ( Irssi::settings_get_bool("url_log") ) {
log_to_file( $nick, $target, $url );
}
if ( Irssi::settings_get_str('url_log_html') ) {
# time, chan, who, url, text
push @url_data,
[
time, ( $ischannel ? $channel : "N/A" ),
$nick, $url, $text, undef
];
$channels{$ischannel ? $channel : "N/A"} ++;
&generate_html;
if (
time - $last_save_time >
Irssi::settings_get_int('url_log_save_interval') )
{
&save_url_data;
}
}
}
}
#
# url command handler
#
sub sig_url {
my ( $cmd_line, $server, $win_item ) = @_;
my @args = split( ' ', $cmd_line );
if ( @args <= 0 ) {
&help;
return;
}
my $action = lc( shift(@args) );
if ( $action eq 'save' ) {
&save_url_data;
print_msg "URL data saved";
}
elsif ( $action eq 'load' ) {
&load_url_data;
&generate_html;
print_msg "URL data loaded";
}
elsif ( $action eq 'regenerate' ) {
&generate_html;
print_msg "HTML page regenerated";
}
elsif ( $action eq 'debug' ) {
print_msg "URL cache (\%seen): ". scalar keys(%seen) . " urls.";
print_msg "URL list (\@url_data): ". scalar @url_data . " urls.";
}
else {
print_msg "Unknown action";
}
}
sub save_url_data {
my $filename = Irssi::settings_get_str('url_log_data_file');
unless ( open( SAVE, ">$filename.new" ) ) {
print_msg "Failed to write $filename.new: $!";
return;
}
foreach (@url_data) {
next
if time - $_->[0] >
Irssi::settings_get_int('url_log_html_days') * 24 * 60 * 60;
print SAVE join "\t", @$_;
print SAVE "\n";
}
close SAVE;
move "$filename.new", $filename;
}
sub load_url_data {
my $filename = Irssi::settings_get_str('url_log_data_file');
unless ( open( SAVE, $filename ) ) {
print_msg "Failed to read $filename: $!";
return;
}
@url_data = ();
while () {
chomp;
my @line = split /\t/, $_, 6;
next
if time - $line[0] >
Irssi::settings_get_int('url_log_html_days') * 24 * 60 * 60;
push @url_data, [@line]; # time, chan, who, url, text, title
$channels{$line[1]}++;
}
close SAVE;
}
sub generate_html {
my ( $cmd_line, $server, $win_item ) = @_;
my @args = split( ' ', $cmd_line );
my $filename = Irssi::settings_get_str('url_log_html');
unless ( open( HTML, ">$filename.new" ) ) {
print_msg "Failed to write $filename.new: $!";
return;
}
binmode HTML, ':utf8';
print HTML <URL log for $ENV{USER}
EOT
print HTML "
| ";
foreach ("ALL", sort keys %channels) {
print HTML qq{$_ | };
}
print HTML "
";
print HTML qq{Click on the date to fold/unfold};
my $lastday = undef;
foreach my $entry ( reverse @url_data ) {
my ( $time, $chan, $who, $url, $text, $title ) = @$entry;
$title ||= $url;
last
if time - $time >
Irssi::settings_get_int('url_log_html_days') * 24 * 60 * 60;
my @time = localtime($time);
if ( $lastday and $time[3] ne $lastday ) {
print HTML "\n";
}
if ( not $lastday or $time[3] != $lastday ) {
my @wday =
qw/Sunday Monday Tuesday Wednesday Thursday Friday Saturday/;
printf HTML '
[ ';
printf HTML '%02d:%02d |', @time[ 2, 1 ];
print HTML qq{ $chan |};
print HTML qq{ $who ] };
print HTML qq{$title} if $title;
print HTML qq{ - $text } if $text;
print HTML "