159 lines
3.9 KiB
Go
159 lines
3.9 KiB
Go
package city
|
|
|
|
import "encoding/binary"
|
|
|
|
// A subroutine for CH128(). Returns a decent 128-bit hash for strings
|
|
// of any length representable in signed long. Based on City and Mumur.
|
|
func chMurmur(s []byte, seed U128) U128 {
|
|
length := len(s)
|
|
a := seed.Low
|
|
b := seed.High
|
|
c := uint64(0)
|
|
d := uint64(0)
|
|
l := length - 16
|
|
if len(s) <= 16 { // length <= 16
|
|
a = shiftMix(a*k1) * k1
|
|
c = b*k1 + ch0to16(s, length)
|
|
|
|
if length >= 8 {
|
|
d = shiftMix(a + binary.LittleEndian.Uint64(s))
|
|
} else {
|
|
d = shiftMix(a + c)
|
|
}
|
|
} else { // length > 16
|
|
c = ch16(binary.LittleEndian.Uint64(s[length-8:])+k1, a)
|
|
d = ch16(b+uint64(length), c+binary.LittleEndian.Uint64(s[length-16:]))
|
|
a += d
|
|
|
|
{
|
|
a ^= shiftMix(binary.LittleEndian.Uint64(s[0:8:8])*k1) * k1
|
|
a *= k1
|
|
b ^= a
|
|
c ^= shiftMix(binary.LittleEndian.Uint64(s[8:8+8:8+8])*k1) * k1
|
|
c *= k1
|
|
d ^= c
|
|
s = s[16:]
|
|
l -= 16
|
|
}
|
|
|
|
if l > 0 {
|
|
for len(s) >= 16 {
|
|
a ^= shiftMix(binary.LittleEndian.Uint64(s[0:8:8])*k1) * k1
|
|
a *= k1
|
|
b ^= a
|
|
c ^= shiftMix(binary.LittleEndian.Uint64(s[8:8+8:8+8])*k1) * k1
|
|
c *= k1
|
|
d ^= c
|
|
s = s[16:]
|
|
l -= 16
|
|
|
|
if l <= 0 {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
a = ch16(a, c)
|
|
b = ch16(d, b)
|
|
return U128{a ^ b, ch16(b, a)}
|
|
}
|
|
|
|
// CH128 returns 128-bit ClickHouse CityHash.
|
|
func CH128(s []byte) U128 {
|
|
if len(s) >= 16 {
|
|
return CH128Seed(s[16:], U128{
|
|
Low: binary.LittleEndian.Uint64(s[0:8:8]) ^ k3,
|
|
High: binary.LittleEndian.Uint64(s[8 : 8+8 : 8+8]),
|
|
})
|
|
}
|
|
if len(s) >= 8 {
|
|
l := uint64(len(s))
|
|
return CH128Seed(nil, U128{
|
|
Low: binary.LittleEndian.Uint64(s) ^ (l * k0),
|
|
High: binary.LittleEndian.Uint64(s[l-8:]) ^ k1,
|
|
})
|
|
}
|
|
return CH128Seed(s, U128{Low: k0, High: k1})
|
|
}
|
|
|
|
// CH128Seed returns 128-bit seeded ClickHouse CityHash.
|
|
func CH128Seed(s []byte, seed U128) U128 {
|
|
if len(s) < 128 {
|
|
return chMurmur(s, seed)
|
|
}
|
|
|
|
// Saving initial input for tail hashing.
|
|
t := s
|
|
|
|
// We expect len >= 128 to be the common case. Keep 56 bytes of state:
|
|
// v, w, x, y and z.
|
|
var v, w U128
|
|
x := seed.Low
|
|
y := seed.High
|
|
z := uint64(len(s)) * k1
|
|
|
|
{
|
|
subSlice := (*[96]byte)(s[0:])
|
|
v.Low = rot64(y^k1, 49)*k1 + binary.LittleEndian.Uint64(subSlice[0:])
|
|
v.High = rot64(v.Low, 42)*k1 + binary.LittleEndian.Uint64(subSlice[8:])
|
|
w.Low = rot64(y+z, 35)*k1 + x
|
|
w.High = rot64(x+binary.LittleEndian.Uint64(subSlice[88:]), 53) * k1
|
|
}
|
|
|
|
// This is the same inner loop as CH64(), manually unrolled.
|
|
for len(s) >= 128 {
|
|
// Roll 1.
|
|
{
|
|
x = rot64(x+y+v.Low+binary.LittleEndian.Uint64(s[16:16+8:16+8]), 37) * k1
|
|
y = rot64(y+v.High+binary.LittleEndian.Uint64(s[48:48+8:48+8]), 42) * k1
|
|
|
|
x ^= w.High
|
|
y ^= v.Low
|
|
|
|
z = rot64(z^w.Low, 33)
|
|
v = weakHash32SeedsByte(s, v.High*k1, x+w.Low)
|
|
w = weakHash32SeedsByte(s[32:], z+w.High, y)
|
|
z, x = x, z
|
|
}
|
|
|
|
// Roll 2.
|
|
{
|
|
const offset = 64
|
|
x = rot64(x+y+v.Low+binary.LittleEndian.Uint64(s[offset+16:offset+16+8:offset+16+8]), 37) * k1
|
|
y = rot64(y+v.High+binary.LittleEndian.Uint64(s[offset+48:offset+48+8:offset+48+8]), 42) * k1
|
|
x ^= w.High
|
|
y ^= v.Low
|
|
|
|
z = rot64(z^w.Low, 33)
|
|
v = weakHash32SeedsByte(s[offset:], v.High*k1, x+w.Low)
|
|
w = weakHash32SeedsByte(s[offset+32:], z+w.High, y)
|
|
z, x = x, z
|
|
}
|
|
s = s[128:]
|
|
}
|
|
|
|
y += rot64(w.Low, 37)*k0 + z
|
|
x += rot64(v.Low+z, 49) * k0
|
|
|
|
// If 0 < length < 128, hash up to 4 chunks of 32 bytes each from the end of s.
|
|
for i := 0; i < len(s); {
|
|
i += 32
|
|
y = rot64(y-x, 42)*k0 + v.High
|
|
w.Low += binary.LittleEndian.Uint64(t[len(t)-i+16:])
|
|
x = rot64(x, 49)*k0 + w.Low
|
|
w.Low += v.Low
|
|
v = weakHash32SeedsByte(t[len(t)-i:], v.Low, v.High)
|
|
}
|
|
|
|
// At this point our 48 bytes of state should contain more than
|
|
// enough information for a strong 128-bit hash. We use two
|
|
// different 48-byte-to-8-byte hashes to get a 16-byte final result.
|
|
x = ch16(x, v.Low)
|
|
y = ch16(y, w.Low)
|
|
|
|
return U128{
|
|
Low: ch16(x+v.High, w.High) + y,
|
|
High: ch16(x+w.High, y+v.High),
|
|
}
|
|
}
|