/*

Test the cost of multiple threads addressing memory on the same cache line.

Copyright (C) Eric Day - http://oddments.org/
All content licensed under the Creative Commons Attribution 3.0 License.

If two or more threads try to address memory that reside on the same
cache line, extra overhead is incurred to keep those cache lines
accurate. It is good to know what your cache line sizes are so that
each thread in your application can be sure it is not sharing memory
with other threads. This program demonstrates the extra cost of using
adjacent memory in two or more threads. It tests up to 16 threads
and memory spacing up to 256 (which should be larger than most cache
line sizes). The program also runs the same test but with thread local
variables as a base line. The output is in CSV so it can be imported
into a spreadsheet for graphing.

> gcc -o cache_line cache_line.c -lpthread
> ./cache_line 
Shared cache-line
,1 Thread,2 Threads,4 Threads,8 Threads,16 Threads
1,22222,121974,671122,1188277,2550051
2,22214,137352,854652,1232071,1552620
4,22202,44438,881968,738385,1141329
8,22572,816220,88183,473661,422146
16,22201,22228,43720,81015,159115
32,22207,22227,43563,81128,190002
64,22203,34243,53038,91301,178124
128,22208,22206,65184,81307,184931
256,22199,44052,58758,86770,160736
Local cache-line
,1 Thread,2 Threads,4 Threads,8 Threads,16 Threads
1,17763,35481,47427,82711,142880
2,17752,33102,35510,65311,145929
4,17759,17760,47859,66232,143629
8,17755,17761,47854,65012,142636
16,17756,35494,33921,95427,141427
32,17767,34045,35507,71580,147418
64,17758,35501,33378,82944,141263
128,17768,34006,46820,65219,162269
256,17755,17763,37843,81372,138011

*/

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>

#define MAX_THREADS 16
#define MAX_SPACING 256
#define ITERATIONS 10000000

pthread_mutex_t lock;
int counter[MAX_THREADS * MAX_SPACING];
int counter_index;
int spacing;

void _run_threads(void *(*function)(void*));
void *_counter_shared(void *data);
void *_counter_local(void *data);

int main(void)
{
  assert(pthread_mutex_init(&lock, NULL) == 0);

  printf("Shared cache-line\n");
  _run_threads(_counter_shared);

  printf("Local cache-line\n");
  _run_threads(_counter_local);

  assert(pthread_mutex_destroy(&lock) == 0);

  return 0;
} 

void _run_threads(void *(*function)(void*))
{
  int thread_count;
  int thread;
  pthread_t thread_id[MAX_THREADS];
  struct timeval start, end;
  unsigned long results[MAX_THREADS][MAX_SPACING];

  for (spacing= 1; spacing <= MAX_SPACING; spacing*= 2)
  {
    for (thread_count= 1; thread_count <= MAX_THREADS; thread_count*= 2)
    {
      memset(counter, 0, sizeof(int) * MAX_THREADS * MAX_SPACING);
      counter_index= 0;

      /* Lock here so threads don't start immediately. */
      assert(pthread_mutex_lock(&lock) == 0);

      for (thread = 0; thread < thread_count; thread++)
        pthread_create(&thread_id[thread], NULL, function, NULL);

      gettimeofday(&start, NULL);
      /* Unlock to start test now that all threads are created. */
      assert(pthread_mutex_unlock(&lock) == 0);

      for (thread = 0; thread < thread_count; thread++)
        pthread_join(thread_id[thread], NULL);

      gettimeofday(&end, NULL);
      results[thread_count - 1][spacing - 1]=
        ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);
    }
  }  

  /* Output results in CSV format. */
  for (thread_count= 1; thread_count <= MAX_THREADS; thread_count*= 2)
    printf(",%d %s", thread_count, thread_count == 1 ? "Thread" : "Threads");
  printf("\n");

  for (spacing= 1; spacing <= MAX_SPACING; spacing*= 2)
  {
    printf("%d", spacing);
    for (thread_count= 1; thread_count <= MAX_THREADS; thread_count*= 2)
      printf(",%ld", results[thread_count - 1][spacing - 1]);
    printf("\n");
  }
}

void *_counter_shared(void *data)
{
  int x;
  int y;
  (void) data;

  assert(pthread_mutex_lock(&lock) == 0);
  y= counter_index;
  counter_index+= spacing;
  assert(pthread_mutex_unlock(&lock) == 0);

  for (x= 0; x < ITERATIONS; x++)
    counter[y]++;

  return NULL;
}

void *_counter_local(void *data)
{
  int x;
  int y;
  int local_counter= 0;
  (void) data;

  assert(pthread_mutex_lock(&lock) == 0);
  y= counter_index;
  counter_index+= spacing;
  assert(pthread_mutex_unlock(&lock) == 0);

  for (x= 0; x < ITERATIONS; x++)
    local_counter++;

  counter[y]= local_counter;

  return NULL;
}
