Eskil

Artifact [fe94732ae7]
Login

Artifact fe94732ae7801a6f3a0f97cf51c9d46cb26a0ce3:


#----------------------------------------------------------------------
#  Eskil, comparing
#
#  Copyright (c) 1998-2005, Peter Spjuth  (peter.spjuth@space.se)
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; see the file COPYING.  If not, write to
#  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
#  Boston, MA 02111-1307, USA.
#
#----------------------------------------------------------------------
# $Revision$
#----------------------------------------------------------------------

# Compare two lines and rate how much they resemble each other.
# This has never worked well. Some day I'll sit down, think this through,
# and come up with a better algorithm.
proc compareLines2 {line1 line2} {
    set opts $::Pref(ignore)
    if {$::Pref(nocase)} {lappend opts -nocase}
    set res [eval DiffUtil::diffStrings $opts \$line1 \$line2]

    # Collect identical pieces and different pieces
    set sames {}
    set diffs1 {}
    set diffs2 {}
    foreach {same1 same2 diff1 diff2} $res {
        lappend sames $same1
        if {$diff1 != ""} {
            lappend diffs1 $diff1
        }
        if {$diff2 != ""} {
            lappend diffs2 $diff2
        }
    }
    set sumsame 0
    set sumdiff1 0
    set sumdiff2 0
    foreach same $sames {
        set apa [string length [string trim $same]]
        incr sumsame [expr {$apa * $apa}]
    }
    foreach diff $diffs1 {
        set apa [string length $diff]
        incr sumdiff1 $apa
    }
    foreach diff $diffs2 {
        set apa [string length $diff]
        incr sumdiff2 $apa
    }
#    puts "Same ($sames)"
#    puts "D1   ($diffs1)"
#    puts "D2   ($diffs2)"
#    puts "S $sumsame D $sumdiff1 D $sumdiff2"
    return [expr {$sumsame - [maxAbs $sumdiff1 $sumdiff2]}]
}

# Initialise a multidimensional list with some value
# This should use lrepeat once 8.5 is required
# The args are in the same order as indexes to lset/lindex
proc Linit {elem args} {
    for {set t [expr {[llength $args] - 1}]} {$t >= 0} {incr t -1} {
	set new {}
	for {set j [lindex $args $t]} {$j >= 1} {incr j -1} {
	    lappend new $elem
	}
	set elem $new
    }
    return $elem
}

# Decide how to display change blocks
# This tries to match the lines that resemble each other and put them
# next to each other.
# As compareLines2, this would need a complete rework and a
# better algorithm.
proc compareBlocks {block1 block2} {
    set size1 [llength $block1]
    set size2 [llength $block2]

    # Swap if block1 is bigger
    if {$size1 > $size2} {
        set apa $block1
        set block1 $block2
        set block2 $apa
        set size1 [llength $block1]
        set size2 [llength $block2]
        set dsym a
        set asym d
    } else {
        set dsym d
        set asym a
    }

    # Collect statistics
    set scores [Linit {} $size1 $size2]
    set emptyResult [Linit {} $size1]
    set scoresbest $emptyResult
    set origresult $emptyResult

    set j 0
    set bestsum 0
    foreach line1 $block1 {
        set bestscore -100000
        set bestline 0
        set i 0
        foreach line2 $block2 {
            set x [compareLines2 $line1 $line2]
            lset scores $j $i $x
            #puts "Score $j $i : $x"
            if {$x > $bestscore} {
                set bestscore $x
                set bestline $i
            }
            incr i
        }
        #puts "Best for $j is $bestline : $bestscore"
        lset origresult $j $bestline
        lset scoresbest $j $bestscore
        incr bestsum $bestscore
        incr j
    }
    #puts "Bestsum: $bestsum"

    # origresult holds a mapping between blocks where each row
    # is paired with its best match. This may not be a possible
    # result since it has to be in order.

    #puts "Origresult: $origresult"

    # If the size is 1, it is automatically in order so we
    # don't need further processing.

    if {$size1 > 1} {
        # Start with a check if the theoretical best works, since often that
        # is the case.
        set order 1
        set result $origresult
        for {set i 0} {$i < ($size1 - 1)} {incr i} {
            if {[lindex $result $i] >= [lindex $result [expr {$i + 1}]]} {
                set order 0
                break
            }
        }
        #if {$order} {puts "ORDER"}
    }

    set bestresult $origresult
    set bestscoresum -100000

    if {$size1 > 1 && $order == 0} {
        # Look through the obvious "subblock" alternatives

        for {set startj 0} {$startj < ($size2 - $size1 + 1)} {incr startj} {
	    set sum 0
	    set result $emptyResult
	    for {set i 0 ; set j $startj} {$i < $size1} {incr i ; incr j} {
		lset result $i $j
		incr sum [lindex $scores $i $j]
	    }
	    #puts "Subblock $startj sum: $sum"
            if {$sum > $bestscoresum} {
                #puts "New best: $sum ($bestscoresum)"
                set bestresult $result
                set bestscoresum $sum
            }
	}

        # If we reach 75% if the theoretical best, we take it
        while {$bestscoresum < (3 * $bestsum / 4)} {
            #puts "Outer: $scoresbest"
	    # The outer loop restarts from the "best mapping"
	    set result $origresult
            set mark [Linit 0 $size1]
            set high $mark

            # If result is in order, no problem.
            # Otherwise, try to adjust result to make it ordered

            while {1} {
                #puts "Inner: $scoresbest"
		# The inner loop tries to get the result in order
                set besti 0
                set bestscore -100000
                set order 1
                for {set i 0} {$i < $size1} {incr i} {
                    if {[lindex $mark $i] == 0} {
                        for {set j [expr {$i + 1}]} {$j < $size1} {incr j} {
                            if {[lindex $mark $j] == 0} break
                        }
                        if {$j < $size1 && \
                                [lindex $result $i] >= [lindex $result $j]} {
                            set order 0
                        }
                        set x [lindex $scoresbest $i]
                        if {$x > $bestscore} {
                            set bestscore $x
                            set besti $i
                        }
                    }
                }
                #puts "Best $besti order $order sc $bestscore"
                if {$order} break
                lset mark $besti 1
                set bestr [lindex $result $besti]
                for {set i 0} {$i < $besti} {incr i} {
                    if {[lindex $mark $i] == 0 && \
                            [lindex $result $i] >= $bestr} {
                        lset mark $i 2
                    }
                }
                for {set i [expr {$besti + 1}]} {$i < $size1} {incr i} {
                    if {[lindex $mark $i] == 0 && \
                            [lindex $result $i] <= $bestr} {
                        lset mark $i 2
                    }
                }
            }

            set prev $size2
            for {set i [expr {$size1 - 1}]} {$i >= 0} {incr i -1} {
                if {[lindex $mark $i] != 2} {
                    set prev [lindex $result $i]
                } else {
                    lset high $i [expr {$prev - 1}]
                }
            }
            set prev -1
            for {set i 0} {$i < $size1} {incr i} {
                if {[lindex $mark $i] != 2} {
                    set prev [lindex $result $i]
                } else {
                    if {[lindex $high $i] > $prev} {
                        incr prev
                        lset result $i $prev
                    } else {
                        lset result $i -1
                    }
                }
            }
            set scoresum 0
            for {set i 0} {$i < $size1} {incr i} {
                set j [lindex $result $i]
                set sc [lindex $scores $i $j] ;# FIXA: can this fail?
                if {[string is integer -strict $sc]} {
                    #puts "Score: $i $j [lindex $scores $i $j]"
                    incr scoresum $sc
                }
            }
            #puts "Scoresum: $scoresum ($bestscoresum)"

	    # If it was not an improvement over previous iteration, quit
            if {$scoresum <= $bestscoresum} {
                break
	    }

	    set bestresult $result
	    set bestscoresum $scoresum

	    # We are redoing from start, but try to improve by
	    # ignoring the most awkwardly placed line.
	    set mostp -1
	    set mosti 0
	    for {set i 0} {$i < $size1} {incr i} {
		if {[lindex $mark $i] == 1} {
		    if {abs([lindex $result $i] - $i) > $mostp} {
			set mostp [expr {abs([lindex $result $i] - $i)}]
                        set mosti $i
		    }
		}
	    }
	    #puts "Most $mosti $mostp"
	    lset scoresbest $mosti 0
        }
    }

    set result $bestresult

    # Collect the result into diff-like codes to use as display info.

    set apa {}
    set t1 0
    set t2 0
    while {$t1 < $size1 || $t2 < $size2} {
        if {$t1 < $size1} {
            set r [lindex $result $t1]
            if {$r < $t2 || $t2 >= $size2} {
                lappend apa $dsym
                incr t1
            } elseif {$r == $t2} {
                #if {[string match Wm* [lindex $block2 $t2]]} {
                #    puts "Left : [lindex $block1 $t1]"
                #    puts "Right: [lindex $block2 $t2]"
                #    puts "Score: $scores($t1,$t2)"
                #}

                # If the score is too bad, don't do line parsing.
                if {[lindex $scores $t1 $t2] < 0} {
                    lappend apa "C"
                } else {
                    lappend apa "c"
                }
                incr t1
                incr t2
            } else {
                lappend apa $asym
                incr t2
            }
        } else {
            lappend apa $asym
            incr t2
        }
    }
    return $apa
}