📊 Chapter 7, Part 3: GPU Histogram — Counting Votes at Scale
💡 Story: Imagine a giant box with 10 million colored balls, and you want to count how many balls are red, green, blue, etc. On a CPU, you pick up balls one at a time: 10 million operations. On a GPU, 10 million soldiers each pick up one ball and shout their color simultaneously. All red-counters update at the same time — but wait, if two soldiers both try to increment the 'red' counter, you get a race condition! The fix? Local tallying then merging.
// ❌ Naive: all threads hammering the SAME global bins
__global__ void naiveHistogram(int* data, int* hist, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        atomicAdd(&hist[data[i]], 1); // Correct, but heavy ATOMIC CONTENTION!
    // With 1M threads all hitting the 256-bin histogram,
    // many threads compete for the same bin → serialized → slow!
}
// ✅ Optimized: per-block private histogram, then atomic merge
__global__ void fastHistogram(int* data, int* hist, int n, int numBins) {
    extern __shared__ int localHist[]; // Per-block histogram in shared memory!
    int tid = threadIdx.x;
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    // Step 1: Initialize the local histogram to zero
    for (int i = tid; i < numBins; i += blockDim.x)
        localHist[i] = 0;
    __syncthreads();
    // Step 2: Each thread increments the LOCAL shared-memory histogram
    // (still atomic, but only threads within ONE block compete → much less contention)
    if (gid < n)
        atomicAdd(&localHist[data[gid]], 1);
    __syncthreads();
    // Step 3: Atomically merge the local histogram into the global histogram
    for (int i = tid; i < numBins; i += blockDim.x)
        atomicAdd(&hist[i], localHist[i]);
    // Global atomics: only numBins per block (numBlocks per bin in total),
    // so the worst case of n atomics piling onto one hot global bin is gone,
    // and the per-element atomics now hit fast shared memory instead.
}
// Launch: fastHistogram<<<grid, block, numBins*sizeof(int)>>>(data, hist, n, 256);
🎯 Key idea — Let each block build a private histogram in shared memory
⚡ Phase 1 — Per-block atomic increments (low contention — only 256 threads per block compete)
🔗 Phase 2 — One final atomic merge per bin per block into global memory
📉 Contention reduction — Worst case falls from n global atomics on a single hot bin → just numBlocks per bin; per-element atomics move to fast shared memory
🖼️ Use case — Image processing, frequency analysis, data binning, AI feature extraction
📋 Instructions
Build a histogram of the values [2,0,1,2,3,1,0,2,3,3,0,1] into 4 bins, showing the two-phase approach:
```
=== GPU Histogram (Two-Phase) ===
Data: [2, 0, 1, 2, 3, 1, 0, 2, 3, 3, 0, 1]
Bins: 4 (values 0,1,2,3)
--- Phase 1: Build local (per-simulated-block) histograms ---
Block 0 processes [2,0,1,2,3,1] -> local: [1,2,2,1]
Block 1 processes [0,2,3,3,0,1] -> local: [2,1,1,2]
--- Phase 2: Merge into global histogram ---
Global histogram:
Bin 0: 3
Bin 1: 3
Bin 2: 3
Bin 3: 3
Verification: 3+3+3+3 = 12 (matches input size)
```
Run the code as-is. Notice how the 'local' histograms reduce atomic contention: with 256 bins and 256 threads per block, each bin sees roughly one update per block, versus thousands of grid-wide threads potentially piling onto the same bin when every thread hits the global histogram directly.