Using SSE/SSE2 for optimization

최적화 방법 중 하나에 대한 초보자용 소개입니다.

Objective

본 게시물의 목적은 간단한 최적화 방법을 소개하는 것입니다. 나중에 더 많은 글을 투고할 계획입니다.

Introduction

본 게시물은 Intel의 SIMD(Single Instruction Multiple Data) extension 기술에 대해 설명합니다. movdqa와 같이 기존 방식보다 한 번에 더 많은 데이터를 전송할 수 있는 Intel의 SIMD instruction을 사용하여 데이터 복사를 최적화합니다.

Recall

시작하기 전에, 우리가 이미 알고 있는 지식들을 상기해봅시다. 요즘, 또는 일반적으로, 우리는 집에서, 또한 산업에서 32비트 프로세서를 사용하고 있습니다. eax, ebx, .. 와 같은 범용 목적의 레지스터들은 32비트입니다. 그리고 sizeof(int) = 4 (bytes) 입니다. 그러나 모든 레지스터들이 32비트인 것은 아니며, 더 많은 비트 길이를 가지는 레지스터들이 있습니다. 10년 전부터, Intel은 64비트를 가지는 8개의 레지스터 mm0, mm1 .. mm7이 있는 MMX extension을 도입했습니다. 그 이후, Intel은 128비트를 가지는 다른 새로운 레지스터 xmm0, xmm1 .. xmm7이 있는 SSE extension을 도입했습니다.

Requirement

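먼저 자신이 어떤 기기를 사용하고 있는지 알아보세요. SSE는 Intel Pentium III 이상에서 지원되지만, 이 글의 예제에서 사용하는 movdqa는 SSE2 instruction이므로 Pentium 4 이상(또는 SSE2를 지원하는 호환 프로세서)이 필요합니다. 이 최적화 방법은 기기에 의존적이라는 것을 명심해야 합니다. 이는 자신의 하드웨어가 해당 instruction을 지원하지 않는다면, 최적화된 함수를 실행할 수 없고 차이도 볼 수 없다는 것을 의미합니다.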

Code

제가 만든 샘플입니다. 콘솔 모드에서 실행될 수 있도록 일부러 간단하게 만들었습니다. 복사 붙여넣기 하지 않고, 독자가 이해하며 스스로 시도해보는 것을 권장합니다.

데모 코드는 같은 목적으로 제공되는 두 함수의 차이점을 확인시켜 줄 것입니다. 여기서부터 더 많은 설명을 하지 않을 것입니다. 스스로 코드를 보며 코드 내의 주석을 읽어주세요. 이해할 수 있을 것이라 확신합니다. =)

잠깐! 먼저 브레이크 포인트를 준비하고 인내심을 가지고 기다리세요. 디버깅을 할 때, 다음 두 함수를 모두 step해보면 차이점을 알 수 있을 것입니다.

DataTransferTypical은 한 루프에 하나의 int를 복사할 것이고 (sizeof(int) = 4 bytes), 반면 DataTransferOptimised는 한 루프에 4개의 int를 복사할 것입니다 (4 * sizeof(int) = 16 bytes).

Watch 창을 준비하세요. Watch 창에 `piDst, 101`을 입력하면 piDst가 가리키는 배열의 처음 101개 원소를 볼 수 있습니다. 그 뒤 step하면서 값이 어떻게 변하는지 확인하세요.

P.S: 이 코드를 MSVC++로 컴파일하기 위해 processor pack을 설치해야 합니다.

  
// Copies SizeInBytes bytes from piSrc to piDst, one int per loop iteration. Returns 0.
int DataTransferTypical(int* piDst, int* piSrc, unsigned long SizeInBytes);
// Copies SizeInBytes bytes from piSrc to piDst, 16 bytes per loop iteration, using the
// movdqa instruction through a 128-bit xmm register. Returns 0.
int DataTransferOptimised(int* piDst, int* piSrc, unsigned long SizeInBytes);

  
// stdafx.h : include file for standard system include files,
//  or project specific include files that are used frequently, but
//      are changed infrequently
//

#if !defined(AFX_STDAFX_H__CCA0779D_0F4D_4A56_A148_738F06155CA8__INCLUDED_)
#define AFX_STDAFX_H__CCA0779D_0F4D_4A56_A148_738F06155CA8__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#define WIN32_LEAN_AND_MEAN		// Exclude rarely-used stuff from Windows headers

#include <stdio.h>

// TODO: reference additional headers your program requires here

//
// Microsoft Visual C++ will insert additional declarations immediately before the previous line.

#endif // !defined(AFX_STDAFX_H__CCA0779D_0F4D_4A56_A148_738F06155CA8__INCLUDED_)

  
#include "stdafx.h"

#include <conio.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>

#include "malloc.h"

// Number of times each copy routine is called inside one timed measurement.
#define ITERATION 10
// Number of int elements in each of the source and destination buffers
// (0x00100000 = 1,048,576 elements).
#define DATA_SIZE 0x00100000

// Copies SizeInBytes bytes from piSrc to piDst, one int per loop iteration. Returns 0.
int DataTransferTypical(int* piDst, int* piSrc, unsigned long SizeInBytes);
// Copies SizeInBytes bytes from piSrc to piDst, 16 bytes per loop iteration, using the
// movdqa instruction through a 128-bit xmm register. Returns 0.
int DataTransferOptimised(int* piDst, int* piSrc, unsigned long SizeInBytes);

int main(int argc, char* argv[])
{
	// 시작 시간과 종료 시간을 기억합니다.
	// 만약 더 정확한 것을 원한다면, 다른 게시물을 찾아보세요.
	unsigned long dwTimeStart = 0;
	unsigned long dwTimeEnd = 0;

	// 임시 변수입니다.
	int* piSrc = nullptr;
	int* piDst = nullptr;

	int i = 0;
	char cKey = 0;

	unsigned long dwDataSizeInBytes = sizeof(int) * DATA_SIZE;

	// 이 코드를 msvc++으로 컴파일하기 위해 processor pack을 설치해야 합니다.

	piSrc = (int*)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes);
	piDst = (int*)_aligned_malloc(dwDataSizeInBytes, dwDataSizeInBytes);

	do
	{
		// 초기화 합니다.
		memset(piSrc, 1, dwDataSizeInBytes);
		memset(piDst, 0, dwDataSizeInBytes);

		dwTimeStart = clock();
		for (i = 0; i < ITERATION; i++)
			DataTransferTypical(piDst, piSrc, dwDataSizeInBytes);
		dwTimeEnd = clock();
		printf("== Typical Transfer of %d * %d times of %d bytes data ==\\n Time Elapsed = ""% d msec\\n\\n",
		       ITERATION, DATA_SIZE, sizeof(int), dwTimeEnd - dwTimeStart);

		// 초기화 합니다.
		memset(piSrc, 1, dwDataSizeInBytes);
		memset(piDst, 0, dwDataSizeInBytes);

		dwTimeStart = clock();
		for (i = 0; i < ITERATION; i++)
			DataTransferOptimised(piDst, piSrc, dwDataSizeInBytes);
		dwTimeEnd = clock();
		printf("== Optimised Transfer of %d * %d times of %d bytes data ==\\n Time Elapsed = % d msec\\n\\n",
		       ITERATION, DATA_SIZE, sizeof(int), dwTimeEnd - dwTimeStart);

		printf("Rerun? (y/n) ");
		cKey = getche();
		printf("\\n\\n");
	}
	while (cKey == 'y');

	_aligned_free(piSrc);
	_aligned_free(piDst);

	return 0;
}

#pragma warning(push)
#pragma warning(disable:4018 4102)

/*
 * Copies SizeInBytes bytes from piSrc to piDst, one int per loop iteration.
 *
 * piDst        destination buffer, at least SizeInBytes bytes long
 * piSrc        source buffer, at least SizeInBytes bytes long
 * SizeInBytes  number of bytes to copy; only whole ints are copied, so a
 *              trailing remainder of SizeInBytes % sizeof(int) bytes is ignored
 *
 * Always returns 0.
 */
int DataTransferTypical(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
	const unsigned long dwNumElements = SizeInBytes / sizeof(int);

	// The index has the same unsigned type as the element count, so the
	// comparison never mixes signed and unsigned values and cannot overflow.
	for (unsigned long i = 0; i < dwNumElements; i++)
	{
		piDst[i] = piSrc[i];
	}

	return 0;
}

/*
 * Copies SizeInBytes bytes from piSrc to piDst, moving 16 bytes (4 ints) per
 * loop iteration through a 128-bit xmm register with the movdqa instruction.
 *
 * piDst        destination buffer, at least SizeInBytes bytes long
 * piSrc        source buffer, at least SizeInBytes bytes long
 * SizeInBytes  number of bytes to copy; only whole ints are copied, so a
 *              trailing remainder of SizeInBytes % sizeof(int) bytes is ignored
 *
 * The movdqa path is used only when compiling with MSVC for 32-bit x86 and
 * only when both pointers are 16-byte aligned (movdqa faults on unaligned
 * addresses). Whatever the movdqa loop does not cover - the tail of a size
 * that is not a multiple of 16, misaligned buffers, or any other
 * compiler/target - is copied one int at a time.
 *
 * Always returns 0.
 */
int DataTransferOptimised(int* piDst, int* piSrc, unsigned long SizeInBytes)
{
	const unsigned long dwNumElements = SizeInBytes / sizeof(int);

	// Number of leading bytes copied by the movdqa loop. Always a multiple of
	// 16 so that the loop counter below reaches exactly zero.
	unsigned long dwPackedBytes = 0;

#if defined(_MSC_VER) && defined(_M_IX86)
	if (((((size_t)piDst) | ((size_t)piSrc)) & 15) == 0)
		dwPackedBytes = SizeInBytes & ~15UL;

	if (dwPackedBytes != 0)
	{
		_asm
		{
			// Save all general-purpose registers.
			pusha;

			// ecx = number of bytes still to copy.
			mov ecx, dwPackedBytes;
			// edi = destination pointer.
			mov edi, piDst;
			// esi = source pointer.
			mov esi, piSrc;

		copy_loop:
			// Stop once the counter has reached zero.
			cmp ecx, 0;
			jz copy_done;

			// ebx = byte offset of the current 16-byte pack.
			mov ebx, dwPackedBytes;
			sub ebx, ecx;
			// Load 128 bits from the source into xmm1 ...
			movdqa xmm1, [esi + ebx];
			// ... and store them to the destination.
			movdqa [edi + ebx], xmm1;

			// One pack == 4 * sizeof(int) == 16 bytes has been copied.
			sub ecx, 16;
			jmp copy_loop;

		copy_done:
			// Restore the registers saved on entry.
			popa;
		}
	}
#endif

	// Copy every int the movdqa loop did not handle.
	for (unsigned long i = dwPackedBytes / sizeof(int); i < dwNumElements; i++)
	{
		piDst[i] = piSrc[i];
	}

	return 0;
}

#pragma warning(pop)

Finally

이것은 Code Project에서의 제 첫 게시물입니다. 만약 올바르지 않은 것이 있다면 저에게 알려주세요. 또한, 여기에 올려놓은 데모가 초보자들에게 간단히 도움되었으면 합니다. 배움은 즐겁습니다. 그렇죠? =)