Olaf, here is my class:
Option Explicit
'based on Ratcliff-Implementation with additional PreProcessing
'Author: Olaf Schmidt (2008)
'modified by Eduardo (2011)
Private Type SAFEARRAY1D
cDims As Integer
fFeatures As Integer
cbElements As Long
cLocks As Long
pvData As Long
cElements As Long
lLbound As Long
End Type
Private Declare Sub BindArray Lib "kernel32" Alias "RtlMoveMemory" _
(PArr() As Any, PSrc&, Optional ByVal cb& = 4)
Private Declare Sub ReleaseArray Lib "kernel32" Alias "RtlMoveMemory" _
(PArr() As Any, Optional PSrc& = 0, Optional ByVal cb& = 4)
Private Declare Function CharLowerBuffA Lib "user32" _
(lpsz As Any, ByVal cchLength&) As Long
Private Declare Function CharLowerBuffW Lib "user32" _
(lpsz As Any, ByVal cchLength&) As Long
Private aLowChars%(&H8000 To &H7FFF)
Private Arr1() As Integer, S1Arr() As Integer, saS1Arr As SAFEARRAY1D
Private Arr2() As Integer, S2Arr() As Integer, saS2Arr As SAFEARRAY1D
Private mDeletedChars1() As Integer, S3Arr() As Integer, saS3Arr As
SAFEARRAY1D
Private mDeletedChars2() As Integer, S4Arr() As Integer, saS4Arr As
SAFEARRAY1D
Private Sub Class_Initialize()
saS1Arr.cDims = 1: saS1Arr.cbElements = 2 '2 Bytes per Element
BindArray S1Arr, VarPtr(saS1Arr)
saS2Arr.cDims = 1: saS2Arr.cbElements = 2 '2 Bytes per Element
BindArray S2Arr, VarPtr(saS2Arr)
saS3Arr.cDims = 1: saS3Arr.cbElements = 2 '2 Bytes per Element
BindArray S3Arr, VarPtr(saS3Arr)
saS4Arr.cDims = 1: saS4Arr.cbElements = 2 '2 Bytes per Element
BindArray S4Arr, VarPtr(saS4Arr)
ReDim Arr1(8192): ReDim Arr2(8192) 'preinitialize our LowerBufArrays
ReDim mDeletedChars1(8192): ReDim mDeletedChars2(8192) 'preinitialize
InitLowCharLUT
End Sub
Private Sub Class_Terminate()
ReleaseArray S1Arr 'resets S1Arr into its original, "virginal" state
ReleaseArray S2Arr 'resets S1Arr into its original, "virginal" state
ReleaseArray S3Arr 'resets S1Arr into its original, "virginal" state
ReleaseArray S4Arr 'resets S1Arr into its original, "virginal" state
End Sub
Public Function WordsSimilarity(S1 As String, S2 As String) As Long
Dim i As Long, l1 As Long, l2 As Long, D As Single
Dim iLonger As Long
Dim iEqualChars As Long
Dim iCoincidence As Single
Dim iDiferences As Long
Dim c As Long
Dim iIndexScanFromEnd As Long
Dim iLastDeletedEqualFromBegining As Long
Dim iNumberOfDeleted1 As Long
Dim iNumberOfDeleted2 As Long
l1 = Len(S1): l2 = Len(S2)
If l1 = 0 Or l2 = 0 Then Exit Function
'pretest for exact similarity
If l1 = l2 Then
If S1 = S2 Then WordsSimilarity = 100: Exit Function
End If
'make sure, we have enough space in our CompareBuffers
If UBound(Arr1) < l1 + 1 Then ReDim Preserve Arr1(l1 + 1)
If UBound(Arr2) < l2 + 1 Then ReDim Preserve Arr2(l2 + 1)
If UBound(mDeletedChars1) < l1 + 1 Then ReDim Preserve
mDeletedChars1(l1 + 1)
If UBound(mDeletedChars2) < l2 + 1 Then ReDim Preserve
mDeletedChars2(l2 + 1)
'prepare for fast preprocessing (map S1 and S2 to Integer-Arrays)
saS1Arr.pvData = StrPtr(S1): saS1Arr.cElements = l1 + 1
saS2Arr.pvData = StrPtr(S2): saS2Arr.cElements = l2 + 1
saS3Arr.cElements = l1 + 1
saS4Arr.cElements = l2 + 1
'preprocess the stringcontent into the real Buffers Arr1 and Arr2
' now we get the chars that have been deleted in the preprocessing into
' two arrays, mDeletedChars1 and mDeletedChars2
D = PreProcessing(S1Arr, Arr1, l1, mDeletedChars1, iNumberOfDeleted1)
D = D + PreProcessing(S2Arr, Arr2, l2, mDeletedChars2,
iNumberOfDeleted2)
' test if the deleted chars were coincident (or at least some)
' for the two words, if so, then add the percentage again
iLastDeletedEqualFromBegining = 0
For c = 1 To iNumberOfDeleted1
If c > iNumberOfDeleted2 Then Exit For
If mDeletedChars1(c) = mDeletedChars2(c) Then
D = D - 0.04
iLastDeletedEqualFromBegining = c
End If
Next c
iIndexScanFromEnd = iNumberOfDeleted2 + 1
For c = iNumberOfDeleted1 To 1 Step -1
iIndexScanFromEnd = iIndexScanFromEnd - 1
If iIndexScanFromEnd < 1 Then Exit For
If c <= iLastDeletedEqualFromBegining Then Exit For
If iIndexScanFromEnd <= iLastDeletedEqualFromBegining Then Exit For
If mDeletedChars1(c) = mDeletedChars2(iIndexScanFromEnd) Then
D = D - 0.04
End If
Next c
If D > 0.12 Then D = 0.12 'limit the Replacement-Reduction
' (the replacement reduction limit was increased)
'reset the mapping
saS1Arr.pvData = 0: saS1Arr.cElements = 0
saS2Arr.pvData = 0: saS2Arr.cElements = 0
saS3Arr.pvData = 0: saS3Arr.cElements = 0
saS4Arr.pvData = 0: saS4Arr.cElements = 0
' the similarity is based on Levenstein Distance
If l1 > l2 Then
iLonger = l1
Else
iLonger = l2
End If
iDiferences = LevDist03(Arr1, Arr2, l1, l2)
If iDiferences = 0 Then
WordsSimilarity = 100 - D * 100
Exit Function
End If
iEqualChars = iLonger - iDiferences
If iEqualChars = 0 Then
WordsSimilarity = 0
Exit Function
End If
iCoincidence = iEqualChars / iLonger
WordsSimilarity = (iCoincidence - D) * 100
If WordsSimilarity < 0 Then WordsSimilarity = 0
End Function
'preprocessing, optimized for german-language
Private Function PreProcessing(ByRef Src() As Integer, ByRef Dst() _
As Integer, L As Long, ByRef nDeletedChars() As Integer, _
nNumberOfDeleted As Long) As Single
Dim i As Long, J As Long, LChar As Integer, ReplCount As Long
Dim StartIdx As Long
Dim n As Long
Dim nc As Long
Dim ce As Long
Dim c As Long
'replace a leading 'h' with nothing and weight it with 2
If aLowChars(Src(0)) = 104 Then StartIdx = 1: ReplCount = 2
For i = StartIdx To L - 1
LChar = aLowChars(Src(i))
'each char-doubling of the *same* char is reduced to one char
If LChar = aLowChars(Src(i + 1)) Then
Dst(J) = LChar: J = J + 1
i = i + 1 'skip one source-char
ReplCount = ReplCount + 1
nNumberOfDeleted = nNumberOfDeleted + 1
nDeletedChars(nNumberOfDeleted) = LChar
GoTo Continue
End If
' If there are several consecutive consonants, only the first
is left
' (may be too drastic, leaving two consecutive consonants may be
' also good or even better, it's something to modify and test)
' Also here the accents are not filtered (they would be treated as
' consonants), that's because I'm working with words already
filtered
If IsConsonant(LChar) Then
n = 1
If i + n <= L - 1 Then
Do Until Not IsConsonant(aLowChars(Src(i + n)))
n = n + 1
If i + n + 1 > UBound(Src) Then Exit Do
Loop
End If
n = n - 1
If n > 0 Then
Dst(J) = LChar: J = J + 1
ReplCount = ReplCount + n
c = 1
For nc = nNumberOfDeleted + 1 To nNumberOfDeleted + n
nDeletedChars(nc) = aLowChars(Src(i + c))
c = c + 1
Next nc
nNumberOfDeleted = nNumberOfDeleted + n
i = i + n 'skip source-chars
GoTo Continue
End If
End If
ReplCount = ReplCount + 1
Dst(J) = LChar: J = J + 1: ReplCount = ReplCount - 1
Continue: Next i
Dst(J) = 0 'set terminating NullChar
L = J 'reflect the eventually reduced CharCount in L
PreProcessing = ReplCount * 0.02 '2 percent reduction per replace
End Function
Private Sub InitLowCharLUT()
Dim J&
'inits a lookup-table for fast (unicode-aware) Lower-Lookups
For J = -32768 To 32767: aLowChars(J) = J: Next J
If CharLowerBuffW(aLowChars(-32768), &H10000) = 0 Then
CharLowerBuffA aLowChars(65), (223 - 65) * 2
End If
' patch the stooges
' S 138/352, s 154/353 | O 140/338, o 156/339 | Z 142/381, z 158/382 |
' Y 159/376, ÿ 255/255
aLowChars(138) = 154: aLowChars(352) = 353
aLowChars(140) = 156: aLowChars(338) = 339
aLowChars(142) = 158: aLowChars(381) = 382
aLowChars(159) = 255: aLowChars(376) = 255
End Sub
Private Function IsConsonant(nChar As Integer) As Boolean
Select Case nChar
Case 97, 101, 105, 111, 117
IsConsonant = False
Case Else
IsConsonant = True
End Select
End Function
Public Function LevDist03(ByRef b1() As Integer, ByRef b2() _
As Integer, l1 As Long, l2 As Long) As Long
' by Donald Lessau (VB newsgroup), 20041129
'
' this is a VB implementation of the Levenstein
' Distance function which ranks words by their
' similarity
Dim D() As Long ' matrix
Dim i As Long ' iterates through String1
Dim J As Long ' iterates through String2
Dim lCost As Long ' lCost
' Step 1
If l1 = 0 Then
LevDist03 = l2
Exit Function
End If
If l2 = 0 Then
LevDist03 = l1
Exit Function
End If
' Step 2: fill matrix
ReDim D(0 To l1, 0 To l2) As Long
For i = 0 To l1
D(i, 0) = i
Next
For J = 0 To l2
D(0, J) = J
Next
' Step 3
For i = 1 To l1
For J = 1 To l2
If b1(i - 1) = b2(J - 1) Then
lCost = 0
Else
lCost = 1
End If
D(i, J) = MinThree01(D(i - 1, J) + 1, _
D(i, J - 1) + 1, D(i - 1, J - 1) + lCost)
Next
Next
LevDist03 = D(l1, l2)
End Function
Private Function MinThree01(ByVal l1&, ByVal l2&, _
ByVal l3&) As Long
' by Donald, 20011116
If l1 < l2 Then
If l3 < l1 Then MinThree01 = l3 Else MinThree01 = l1
Else
If l2 < l3 Then MinThree01 = l2 Else MinThree01 = l3
End If
End Function