Harald: Dubletten löschen

Beitrag lesen

Hallo,

kann mir vielleicht jemand mit einem cgi script helfen ?
Es handelt sich dabei um ein cgi, welches doppelte Einträge aus einer Datenbank heraussuchen soll. Das Problem ist, es handelt sich dabei um 2,5 Millionen Datensätze und ich teile mir den Server mit vielen anderen. Wenn ich nun das Programm starte, wird eine riesige temporäre Mysql Tabelle angelegt, die fast den gesamten Server lahmlegt.

Das Script durchforstet die komplette Datenbank und ich möchte nun eine WHERE funktion einbauen, z.b. WHERE ID < 100000, damit immer nur ein Teil der Datenbank gechecked wird, weiss aber leider nicht, wo und wie ich die Abfrage einbauen muss. Kann mir da vielleicht jemand einen Tip geben ?

Hier das Script:

==================================================================

Plugins::CheckDuplicates - Auto Generated Program Module

#   Plugins::CheckDuplicates
#   Author  : Virginia
#   Version : 1
#   Updated : Fri Mar 14 14:04:43 2003

==================================================================

package Plugins::CheckDuplicates;

==================================================================

use strict;
    use GT::Base;
    use GT::Plugins qw/STOP CONTINUE/;
 use GT::Template;
    use Links qw/$CFG $IN $DB/;

Inherit from base class for debug and error methods

@Plugins::CheckDuplicates::ISA = qw(GT::Base);

Your code begins here! Good Luck!

ADMIN MENU OPTIONS

===================================================================

sub check_titles {

-------------------------------------------------------------------

This subroutine will get called whenever the user clicks

on 'Check Titles' in the admin menu. Remember, you need to print

your own content-type headers; you should use

#   print $IN->header();

my $params = &check_duplicates('Title');

Make sure a root is set.

my $opts ||= {};
    $opts->{root}  ||= "$CFG->{admin_root_path}/templates/admin";
    defined $opts->{print} or ($opts->{print} = 1);

Print the page.

print $IN->header();
    GT::Template->parse ( "plugins_duplicate.html", $params, $opts );

}
sub check_descriptions {

-------------------------------------------------------------------

This subroutine will get called whenever the user clicks

on 'Check Description' in the admin menu. Remember, you need to print

your own content-type headers; you should use

#   print $IN->header();

my $params = &check_duplicates('Description');

Make sure a root is set.

my $opts ||= {};
    $opts->{root}  ||= "$CFG->{admin_root_path}/templates/admin";
    defined $opts->{print} or ($opts->{print} = 1);

Print the page.

print $IN->header();
    GT::Template->parse ( "plugins_duplicate.html", $params, $opts );

}

sub check_duplicates {

------------------------------------------------------------------

Displays a list of duplicate URL's.

my $field = shift || "Title";
    my $db = $DB->table('Links');
    my $nh = $IN->param('nh') || 1;
    my $mh = $IN->param('mh') || 10;
    my $begin = ($nh - 1) * $mh;
    my $end   = $begin + $mh;

We turn on big tables as this is usually a large query for MySQL.

if (lc $db->{connect}->{driver} eq 'mysql') {
        my $sth = $db->prepare ("SET OPTION SQL_BIG_TABLES = 1");
        $sth->execute;
    }

Now get URL's and Counts.

$db->select_options ("GROUP BY $field", "ORDER BY hits DESC");
    my $sth     = $db->select ( [$field, 'COUNT(*) AS hits'], GT::SQL::Condition->new ($field, '<>', "") );
    my $row_num = -1;
    my $total   = 0;
    my $dupes   = '';
    my %seen;
    while (my ($url, $count) = $sth->fetchrow_array) {
        last if ($count == 1);
        $total += $count;
        $row_num++;
        $seen{$url} = 1;
        next if ($row_num < $begin);
        last if ($row_num >= $end);

my $sth2 = $db->select( { $field => $url }, ['ID','URL',"$field"] );
        $dupes .= qq~
<tr><td bgcolor="#dddddd"><font face="Tahoma,Arial,Helvetica" size="2"><b>$url - $count</b></font>
</td></tr>
        ~;
        while (my ($id, $url2, $title) = $sth2->fetchrow_array) {
            my $cats = $db->get_categories($id);
            my ($cid, $cname) = each %$cats;

$dupes .= qq~
<tr><td valign=top>
    <font face="Tahoma,Arial,Helvetica" size="2">
        <input type="checkbox" name="delete" value="$id">
        <input type="hidden" name="$id-ID" value="$id">
        $id - <a href="$url2">$title</a> - $cname -
        [ <a href="admin.cgi?do=modify_form&db=Links&modify=1&1-ID=$id">Modify</a> ]
    </font>
</td></tr>
            ~;
        }
    }
    while (my ($url, $count) = $sth->fetchrow_array) {
        last if ($count == 1);
        $seen{$url} = 1;
        $total += $count;
    }
    my $url     = $IN->url ( query_string => 1, absolute => 0 );
    my $html    = $DB->html(['Links'], $IN);
    my $hits    = scalar keys %seen;
    my $toolbar = $html->toolbar($nh, $mh, $hits, $url);

return { total => $total, output => $dupes, toolbar => $toolbar, number_urls => $hits };
}

Always end with a 1.

1;