Return Styles: Pseud0ch, Terminal, Valhalla, NES, Geocities, Blue Moon. Entire thread

Post top tier code

Name: Anonymous 2014-07-22 13:29

Can be yours or not, I just want to see something good.

Name: Anonymous 2014-08-09 4:04

Fuerst Hash.

Faster than xxhash, MumurHash, SpookyHash, SBox, CityHash, and all FNV variants. Even faster than Intel's CRC32 instructions. It also has better properties and less collisions than all of the above, due to the 256-bit accumulation window. None of the plebs know about it, and probably never will. They prefer to use shit.

I hand rolled this for x86. x86-64 version was easier and cleaner, due to the larger register count and native 64-bit x 64-bit -> 128-bit integer multiplies.

#
# uint64_t memhash(void const* key, size_t n, uint64_t seed);
#
# Based off of the fast byteswap hash function by Steven Fuerst.
# http://locklessinc.com/articles/fast_hash/
#

.text
.align 16, 0x90

factor128:
.octa 0xd6c573e9c613993d5a379ab38dc5a46b

mix1:
.octa 0x1591aefa5e7e5a171591aefa5e7e5a17

mix2:
.octa 0x2bb6863566c4e7612bb6863566c4e761

.global memhash
.type memhash, @function

memhash:
movl 4(%esp), %ecx # key
movl 8(%esp), %edx # n
movq 12(%esp), %xmm2 # seed
pushl %edi
movd %edx, %xmm1
pushl %esi
movdqa (factor128), %xmm3
movdqa (mix1), %xmm6
movdqa (mix2), %xmm7
cmpl $32, %edx
jb 2f
mov %edx, %eax
shr $5, %eax
testb $15, %cl
jne 5f

# Aligned main loop, process 32-byte aligned chunks at a time

.align 8, 0x90
1: paddq (%ecx), %xmm1
paddq 16(%ecx), %xmm2
pmullw %xmm3, %xmm1
pmullw %xmm3, %xmm2
add $32, %ecx
movdqa %xmm1, %xmm0
punpckhbw %xmm2, %xmm1
punpcklbw %xmm0, %xmm2
dec %eax
jne 1b
pxor %xmm2, %xmm1

# Check for remaining chunk, otherwise extract 128 bits state and proceed to final mix

2: testb $31, %dl
jne 3f
movhlps %xmm1, %xmm2
jmp 4f

# Hash remaining chunk, up to 31-bytes in size

3: testb $16, %dl
je 3f
movdqu (%ecx), %xmm0
add $16, %ecx
pxor %xmm0, %xmm1
pmullw %xmm3, %xmm1
3: pxor %xmm3, %xmm3
movhlps %xmm1, %xmm2
testb $8, %dl
je 3f
movq (%ecx), %xmm0
add $8, %ecx
pxor %xmm0, %xmm1
3: testb $4, %dl
je 3f
movd (%ecx), %xmm3
add $4, %ecx
3: testb $2, %dl
je 3f
movzwl (%ecx), %eax
psllq $16, %xmm3
movd %eax, %xmm0
add $2, %ecx
paddq %xmm0, %xmm3
3: testb $1, %dl
je 3f
movzbl (%ecx), %eax
psllq $8, %xmm3
movd %eax, %xmm0
paddq %xmm0, %xmm3
3: pxor %xmm3, %xmm2

# Use 3x 128bit multiply and xorshift for final mix

4: # Iteration #0

pshufd $0b10110001, %xmm6, %xmm3
punpckldq %xmm1, %xmm1
pmuludq %xmm1, %xmm3 # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
punpckldq %xmm2, %xmm2
pmuludq %xmm6, %xmm1 # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
movd %xmm3, %ecx
pshufd $0b11100001, %xmm1, %xmm1
pshufd $0b11100001, %xmm3, %xmm3
movd %xmm1, %edx
movd %xmm3, %esi
pshufd $0b11010010, %xmm1, %xmm1
pshufd $0b01110010, %xmm3, %xmm3
movd %xmm1, %eax
xor %edi, %edi
pshufd $0b00100111, %xmm1, %xmm1
add %eax, %edx # X = B + C
movd %xmm1, %eax
adc $0, %eax # Y = D + carry
adc $0, %edi # Z = carry
add %ecx, %edx # X = X + E
punpckhdq %xmm1, %xmm1 # [0|0|0|A]
movd %edx, %xmm0 # [0|0|0|X]
adc %esi, %eax # Y = Y + F + carry
movd %xmm3, %edx
punpckhdq %xmm3, %xmm3
adc $0, %edi # Z = Z + carry
movd %xmm3, %esi
add %edx, %eax # Y = Y + G
pshufd $0b10101111, %xmm7, %xmm3
adc %esi, %edi # Z = Z + H + carry
movd %eax, %xmm5 # [0|0|0|Y]
movd %edi, %xmm4 # [0|0|0|Z]
punpckldq %xmm0, %xmm1 # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
punpckldq %xmm4, %xmm5 # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
pmuludq %xmm2, %xmm3 # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
pxor %xmm0, %xmm0
pmuludq %xmm7, %xmm2 # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
punpckldq %xmm3, %xmm0 # [e|0|e|0]
pxor %xmm4, %xmm4
paddd %xmm0, %xmm2 # [d+e|c|b+e|a]
punpckhdq %xmm3, %xmm4 # [g|0|g|0]
paddd %xmm4, %xmm2 # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
paddq %xmm5, %xmm2 # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
pxor %xmm2, %xmm1 # hash1 = hash2 ^ lower_64bits(hash1 * mix1)

# Iteration #1

pshufd $0b10110001, %xmm6, %xmm3
punpckldq %xmm1, %xmm1
pmuludq %xmm1, %xmm3 # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
punpckldq %xmm2, %xmm2
pmuludq %xmm6, %xmm1 # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
movd %xmm3, %ecx
pshufd $0b11100001, %xmm1, %xmm1
pshufd $0b11100001, %xmm3, %xmm3
movd %xmm1, %edx
movd %xmm3, %esi
pshufd $0b11010010, %xmm1, %xmm1
pshufd $0b01110010, %xmm3, %xmm3
movd %xmm1, %eax
xor %edi, %edi
pshufd $0b00100111, %xmm1, %xmm1
add %eax, %edx # X = B + C
movd %xmm1, %eax
adc $0, %eax # Y = D + carry
adc $0, %edi # Z = carry
add %ecx, %edx # X = X + E
punpckhdq %xmm1, %xmm1 # [0|0|0|A]
movd %edx, %xmm0 # [0|0|0|X]
adc %esi, %eax # Y = Y + F + carry
movd %xmm3, %edx
punpckhdq %xmm3, %xmm3
adc $0, %edi # Z = Z + carry
movd %xmm3, %esi
add %edx, %eax # Y = Y + G
pshufd $0b10101111, %xmm7, %xmm3
adc %esi, %edi # Z = Z + H + carry
movd %eax, %xmm5 # [0|0|0|Y]
movd %edi, %xmm4 # [0|0|0|Z]
punpckldq %xmm0, %xmm1 # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
punpckldq %xmm4, %xmm5 # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
pmuludq %xmm2, %xmm3 # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
pxor %xmm0, %xmm0
pmuludq %xmm7, %xmm2 # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
punpckldq %xmm3, %xmm0 # [e|0|e|0]
pxor %xmm4, %xmm4
paddd %xmm0, %xmm2 # [d+e|c|b+e|a]
punpckhdq %xmm3, %xmm4 # [g|0|g|0]
paddd %xmm4, %xmm2 # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
paddq %xmm5, %xmm2 # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
pxor %xmm2, %xmm1 # hash1 = hash2 ^ lower_64bits(hash1 * mix1)

# Iteration #2

pshufd $0b10110001, %xmm6, %xmm3
punpckldq %xmm1, %xmm1
pmuludq %xmm1, %xmm3 # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
punpckldq %xmm2, %xmm2
pmuludq %xmm6, %xmm1 # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
movd %xmm3, %ecx
pshufd $0b11100001, %xmm1, %xmm1
pshufd $0b11100001, %xmm3, %xmm3
movd %xmm1, %edx
movd %xmm3, %esi
pshufd $0b11010010, %xmm1, %xmm1
pshufd $0b01110010, %xmm3, %xmm3
movd %xmm1, %eax
xor %edi, %edi
pshufd $0b00100111, %xmm1, %xmm1
add %eax, %edx # X = B + C
movd %xmm1, %eax
adc $0, %eax # Y = D + carry
adc $0, %edi # Z = carry
add %ecx, %edx # X = X + E
punpckhdq %xmm1, %xmm1 # [0|0|0|A]
movd %edx, %xmm0 # [0|0|0|X]
adc %esi, %eax # Y = Y + F + carry
movd %xmm3, %edx
punpckhdq %xmm3, %xmm3
adc $0, %edi # Z = Z + carry
movd %xmm3, %esi
add %edx, %eax # Y = Y + G
pshufd $0b10101111, %xmm7, %xmm3
adc %esi, %edi # Z = Z + H + carry
movd %eax, %xmm5 # [0|0|0|Y]
movd %edi, %xmm4 # [0|0|0|Z]
punpckldq %xmm0, %xmm1 # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
punpckldq %xmm4, %xmm5 # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
pmuludq %xmm2, %xmm3 # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
pxor %xmm0, %xmm0
pmuludq %xmm7, %xmm2 # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
punpckldq %xmm3, %xmm0 # [e|0|e|0]
pxor %xmm4, %xmm4
paddd %xmm0, %xmm2 # [d+e|c|b+e|a]
punpckhdq %xmm3, %xmm4 # [g|0|g|0]
paddd %xmm4, %xmm2 # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
paddq %xmm5, %xmm2 # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
pxor %xmm2, %xmm1 # hash1 = hash2 ^ lower_64bits(hash1 * mix1)

# Place 64-bit result in %edx:%eax and return

popl %esi
movd %xmm1, %eax
pshufd $0b11100001, %xmm1, %xmm1
popl %edi
movd %xmm1, %edx
ret

# Unaligned main-loop, process 32-byte unaligned chunks at a time

.align 16, 0x90
5: movdqu (%ecx), %xmm0
add $32, %ecx
paddq %xmm0, %xmm1
movdqu -16(%ecx), %xmm0
pmullw %xmm3, %xmm1
paddq %xmm0, %xmm2
pmullw %xmm3, %xmm2
movdqa %xmm1, %xmm0
punpckhbw %xmm2, %xmm1
punpcklbw %xmm0, %xmm2
dec %eax
jne 5b
pxor %xmm2, %xmm1
jmp 2b

.size memhash, .-memhash


Also, check em.

Newer Posts
Don't change these.
Name: Email:
Entire Thread Thread List