/prog/ - Post top tier code

Name: Anonymous 2014-08-09 4:04

Fuerst Hash.

Faster than xxhash, MumurHash, SpookyHash, SBox, CityHash, and all FNV variants. Even faster than Intel's CRC32 instructions. It also has better properties and less collisions than all of the above, due to the 256-bit accumulation window. None of the plebs know about it, and probably never will. They prefer to use shit.

I hand rolled this for x86. x86-64 version was easier and cleaner, due to the larger register count and native 64-bit x 64-bit -> 128-bit integer multiplies.
#
# uint64_t memhash(void const* key, size_t n, uint64_t seed);
#
# Based off of the fast byteswap hash function by Steven Fuerst.
# http://locklessinc.com/articles/fast_hash/ 
#

.text
.align 16, 0x90

factor128:
.octa 0xd6c573e9c613993d5a379ab38dc5a46b

mix1:
.octa 0x1591aefa5e7e5a171591aefa5e7e5a17

mix2:
.octa 0x2bb6863566c4e7612bb6863566c4e761

.global memhash
.type memhash, @function

memhash:
    movl        4(%esp), %ecx                   # key
    movl        8(%esp), %edx                   # n
    movq        12(%esp), %xmm2                 # seed
    pushl       %edi
    movd        %edx, %xmm1
    pushl       %esi
    movdqa      (factor128), %xmm3
    movdqa      (mix1), %xmm6
    movdqa      (mix2), %xmm7
    cmpl        $32, %edx
    jb          2f
    mov         %edx, %eax
    shr         $5, %eax
    testb       $15, %cl
    jne         5f

    # Aligned main loop, process 32-byte aligned chunks at a time

.align 8, 0x90
1:  paddq       (%ecx), %xmm1
    paddq       16(%ecx), %xmm2
    pmullw      %xmm3, %xmm1
    pmullw      %xmm3, %xmm2
    add         $32, %ecx
    movdqa      %xmm1, %xmm0
    punpckhbw   %xmm2, %xmm1
    punpcklbw   %xmm0, %xmm2
    dec         %eax
    jne         1b
    pxor        %xmm2, %xmm1

    # Check for remaining chunk, otherwise extract 128 bits state and proceed to final mix

2:  testb       $31, %dl
    jne         3f
    movhlps     %xmm1, %xmm2
    jmp         4f

    # Hash remaining chunk, up to 31-bytes in size

3:  testb       $16, %dl
    je          3f
    movdqu      (%ecx), %xmm0
    add         $16, %ecx
    pxor        %xmm0, %xmm1
    pmullw      %xmm3, %xmm1
3:  pxor        %xmm3, %xmm3
    movhlps     %xmm1, %xmm2
    testb       $8, %dl
    je          3f
    movq        (%ecx), %xmm0
    add         $8, %ecx
    pxor        %xmm0, %xmm1
3:  testb       $4, %dl
    je          3f
    movd        (%ecx), %xmm3
    add         $4, %ecx
3:  testb       $2, %dl
    je          3f
    movzwl      (%ecx), %eax
    psllq       $16, %xmm3
    movd        %eax, %xmm0
    add         $2, %ecx
    paddq       %xmm0, %xmm3
3:  testb       $1, %dl
    je          3f
    movzbl      (%ecx), %eax
    psllq       $8, %xmm3
    movd        %eax, %xmm0
    paddq       %xmm0, %xmm3
3:  pxor        %xmm3, %xmm2

    # Use 3x 128bit multiply and xorshift for final mix

4:  # Iteration #0

    pshufd      $0b10110001, %xmm6, %xmm3
    punpckldq   %xmm1, %xmm1
    pmuludq     %xmm1, %xmm3                    # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
    punpckldq   %xmm2, %xmm2
    pmuludq     %xmm6, %xmm1                    # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
    movd        %xmm3, %ecx
    pshufd      $0b11100001, %xmm1, %xmm1
    pshufd      $0b11100001, %xmm3, %xmm3
    movd        %xmm1, %edx
    movd        %xmm3, %esi
    pshufd      $0b11010010, %xmm1, %xmm1
    pshufd      $0b01110010, %xmm3, %xmm3
    movd        %xmm1, %eax
    xor         %edi, %edi
    pshufd      $0b00100111, %xmm1, %xmm1
    add         %eax, %edx                      # X = B + C
    movd        %xmm1, %eax
    adc         $0, %eax                        # Y = D + carry
    adc         $0, %edi                        # Z = carry
    add         %ecx, %edx                      # X = X + E
    punpckhdq   %xmm1, %xmm1                    # [0|0|0|A]
    movd        %edx, %xmm0                     # [0|0|0|X]
    adc         %esi, %eax                      # Y = Y + F + carry
    movd        %xmm3, %edx
    punpckhdq   %xmm3, %xmm3
    adc         $0, %edi                        # Z = Z + carry
    movd        %xmm3, %esi
    add         %edx, %eax                      # Y = Y + G
    pshufd      $0b10101111, %xmm7, %xmm3
    adc         %esi, %edi                      # Z = Z + H + carry
    movd        %eax, %xmm5                     # [0|0|0|Y]
    movd        %edi, %xmm4                     # [0|0|0|Z]
    punpckldq   %xmm0, %xmm1                    # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
    punpckldq   %xmm4, %xmm5                    # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
    pmuludq     %xmm2, %xmm3                    # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
    pxor        %xmm0, %xmm0
    pmuludq     %xmm7, %xmm2                    # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
    punpckldq   %xmm3, %xmm0                    # [e|0|e|0]
    pxor        %xmm4, %xmm4
    paddd       %xmm0, %xmm2                    # [d+e|c|b+e|a]
    punpckhdq   %xmm3, %xmm4                    # [g|0|g|0]
    paddd       %xmm4, %xmm2                    # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
    paddq       %xmm5, %xmm2                    # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
    pxor        %xmm2, %xmm1                    # hash1 = hash2 ^ lower_64bits(hash1 * mix1)

    # Iteration #1

    pshufd      $0b10110001, %xmm6, %xmm3
    punpckldq   %xmm1, %xmm1
    pmuludq     %xmm1, %xmm3                    # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
    punpckldq   %xmm2, %xmm2
    pmuludq     %xmm6, %xmm1                    # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
    movd        %xmm3, %ecx
    pshufd      $0b11100001, %xmm1, %xmm1
    pshufd      $0b11100001, %xmm3, %xmm3
    movd        %xmm1, %edx
    movd        %xmm3, %esi
    pshufd      $0b11010010, %xmm1, %xmm1
    pshufd      $0b01110010, %xmm3, %xmm3
    movd        %xmm1, %eax
    xor         %edi, %edi
    pshufd      $0b00100111, %xmm1, %xmm1
    add         %eax, %edx                      # X = B + C
    movd        %xmm1, %eax
    adc         $0, %eax                        # Y = D + carry
    adc         $0, %edi                        # Z = carry
    add         %ecx, %edx                      # X = X + E
    punpckhdq   %xmm1, %xmm1                    # [0|0|0|A]
    movd        %edx, %xmm0                     # [0|0|0|X]
    adc         %esi, %eax                      # Y = Y + F + carry
    movd        %xmm3, %edx
    punpckhdq   %xmm3, %xmm3
    adc         $0, %edi                        # Z = Z + carry
    movd        %xmm3, %esi
    add         %edx, %eax                      # Y = Y + G
    pshufd      $0b10101111, %xmm7, %xmm3
    adc         %esi, %edi                      # Z = Z + H + carry
    movd        %eax, %xmm5                     # [0|0|0|Y]
    movd        %edi, %xmm4                     # [0|0|0|Z]
    punpckldq   %xmm0, %xmm1                    # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
    punpckldq   %xmm4, %xmm5                    # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
    pmuludq     %xmm2, %xmm3                    # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
    pxor        %xmm0, %xmm0
    pmuludq     %xmm7, %xmm2                    # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
    punpckldq   %xmm3, %xmm0                    # [e|0|e|0]
    pxor        %xmm4, %xmm4
    paddd       %xmm0, %xmm2                    # [d+e|c|b+e|a]
    punpckhdq   %xmm3, %xmm4                    # [g|0|g|0]
    paddd       %xmm4, %xmm2                    # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
    paddq       %xmm5, %xmm2                    # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
    pxor        %xmm2, %xmm1                    # hash1 = hash2 ^ lower_64bits(hash1 * mix1)

    # Iteration #2

    pshufd      $0b10110001, %xmm6, %xmm3
    punpckldq   %xmm1, %xmm1
    pmuludq     %xmm1, %xmm3                    # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
    punpckldq   %xmm2, %xmm2
    pmuludq     %xmm6, %xmm1                    # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
    movd        %xmm3, %ecx
    pshufd      $0b11100001, %xmm1, %xmm1
    pshufd      $0b11100001, %xmm3, %xmm3
    movd        %xmm1, %edx
    movd        %xmm3, %esi
    pshufd      $0b11010010, %xmm1, %xmm1
    pshufd      $0b01110010, %xmm3, %xmm3
    movd        %xmm1, %eax
    xor         %edi, %edi
    pshufd      $0b00100111, %xmm1, %xmm1
    add         %eax, %edx                      # X = B + C
    movd        %xmm1, %eax
    adc         $0, %eax                        # Y = D + carry
    adc         $0, %edi                        # Z = carry
    add         %ecx, %edx                      # X = X + E
    punpckhdq   %xmm1, %xmm1                    # [0|0|0|A]
    movd        %edx, %xmm0                     # [0|0|0|X]
    adc         %esi, %eax                      # Y = Y + F + carry
    movd        %xmm3, %edx
    punpckhdq   %xmm3, %xmm3
    adc         $0, %edi                        # Z = Z + carry
    movd        %xmm3, %esi
    add         %edx, %eax                      # Y = Y + G
    pshufd      $0b10101111, %xmm7, %xmm3
    adc         %esi, %edi                      # Z = Z + H + carry
    movd        %eax, %xmm5                     # [0|0|0|Y]
    movd        %edi, %xmm4                     # [0|0|0|Z]
    punpckldq   %xmm0, %xmm1                    # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
    punpckldq   %xmm4, %xmm5                    # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
    pmuludq     %xmm2, %xmm3                    # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
    pxor        %xmm0, %xmm0
    pmuludq     %xmm7, %xmm2                    # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
    punpckldq   %xmm3, %xmm0                    # [e|0|e|0]
    pxor        %xmm4, %xmm4
    paddd       %xmm0, %xmm2                    # [d+e|c|b+e|a]
    punpckhdq   %xmm3, %xmm4                    # [g|0|g|0]
    paddd       %xmm4, %xmm2                    # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
    paddq       %xmm5, %xmm2                    # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
    pxor        %xmm2, %xmm1                    # hash1 = hash2 ^ lower_64bits(hash1 * mix1)

    # Place 64-bit result in %edx:%eax and return

    popl        %esi
    movd        %xmm1, %eax
    pshufd      $0b11100001, %xmm1, %xmm1
    popl        %edi
    movd        %xmm1, %edx
    ret

    # Unaligned main-loop, process 32-byte unaligned chunks at a time

.align 16, 0x90
5:  movdqu      (%ecx), %xmm0
    add         $32, %ecx
    paddq       %xmm0, %xmm1
    movdqu      -16(%ecx), %xmm0
    pmullw      %xmm3, %xmm1
    paddq       %xmm0, %xmm2
    pmullw      %xmm3, %xmm2
    movdqa      %xmm1, %xmm0
    punpckhbw   %xmm2, %xmm1
    punpcklbw   %xmm0, %xmm2
    dec         %eax
    jne         5b
    pxor        %xmm2, %xmm1
    jmp         2b

.size memhash, .-memhash
Also, check em.
Post top tier code

1 Name: Anonymous 2014-07-22 13:29

66 Name: Anonymous 2014-08-09 4:04

Name: Anonymous 2014-07-22 13:29

Name: Anonymous 2014-08-09 4:04