测试硬件:CPU-i5-4590
命令行:/arch:AVX
优化项:/O2
main.cpp
#include <iostream> #include <vector> #include "method.h" #include <random> #include <time.h> using std::default_random_engine; using std::uniform_real_distribution; int main(int argc, char* argv[]) { //乘法累加运算 { int size = 33; float *input1 = (float *)malloc(sizeof(float) * size); float *input2 = (float *)malloc(sizeof(float) * size); default_random_engine e; uniform_real_distribution<float> u(0, 1); //随机数分布对象 for (int i = 0; i < size; i++) { input1[i] = u(e); input2[i] = u(e); } int cntLoop = 10000000; clock_t start_t = clock(); float org = 0.0; for (int i = 0; i < cntLoop; i++) org = MathMulAdd(input1, input2, size); printf("org = %f\t", org); printf("cost time: %d(ms)\n", clock() - start_t); start_t = clock(); float sse = 0.0; for (int i = 0; i < cntLoop; i++) sse = SSEMulAdd(input1, input2, size); printf("sse = %f\t", sse); printf("cost time: %d(ms)\n", clock() - start_t); start_t = clock(); float sse_ = 0.0; for (int i = 0; i < cntLoop; i++) sse_ = SSEFmAdd(input1, input2, size); printf("sse_= %f\t", sse_); printf("cost time: %d(ms)\n", clock() - start_t); start_t = clock(); float avx = 0.0; for (int i = 0; i < cntLoop; i++) avx = AVXMulAdd(input1, input2, size); printf("avx = %f\t", avx); printf("cost time: %d(ms)\n", clock() - start_t); start_t = clock(); float avx_ = 0.0; for (int i = 0; i < cntLoop; i++) avx_ = AVXFmAdd(input1, input2, size); printf("avx_= %f\t", avx_); printf("cost time: %d(ms)\n", clock() - start_t); free(input1); free(input2); } //结果: //org = 11.216135 cost time : 174(ms) //sse = 11.216136 cost time : 102(ms) //sse_ = 11.216136 cost time : 119(ms) //avx = 11.216136 cost time : 63(ms) //avx_ = 11.216136 cost time : 61(ms) //加法运算 //{ // int size = 27; // float *input = (float *)malloc(sizeof(float) * size); // for (int i = 0; i < size; i++) // input[i] = 0.0025; // int cntLoop = 300000000; // clock_t start_t = clock(); // float org = 0.0; // for (int i = 0; i < cntLoop; i++) // org = MathSum(input, size); // printf("org = %f\t", org); // 
printf("cost time: %d\n", clock() - start_t); // start_t = clock(); // float sse = 0.0; // for (int i = 0; i < cntLoop; i++) // sse = SSESum(input, size); // printf("sse = %f\t", sse); // printf("cost time: %d\n", clock() - start_t); // start_t = clock(); // float avx = 0.0; // for (int i = 0; i < cntLoop; i++) // avx = AVXSum(input, size); // printf("avx = %f\t", avx); // printf("cost time: %d\n", clock() - start_t); // free(input); //} //结果: //org = 0.067500 cost time : 3062 //sse = 0.067500 cost time : 2283 //avx = 0.067500 cost time : 1829 //最大值/最小值运算 //{ // int size = 58; // float *input = (float *)malloc(sizeof(float) * size); // default_random_engine e; // uniform_real_distribution<float> u(0, 3); //随机数分布对象 // for (int i = 0; i < size; i++) // { // input[i] = u(e); // printf("%f ", input[i]); // if ((i + 1) % 8 == 0) // printf("\n"); // } // printf("\n"); // int cntLoop = 100000000; // clock_t start_t = clock(); // float org; // for (int i = 0; i < cntLoop; i++) // org = MathMax(input, size); // printf("org = %f\t", org); // printf("cost time: %d(ms)\n", clock() - start_t); // start_t = clock(); // float sse; // for (int i = 0; i < cntLoop; i++) // sse = SSEMax(input, size); // printf("sse = %f\t", sse); // printf("cost time: %d(ms)\n", clock() - start_t); // start_t = clock(); // float avx; // for (int i = 0; i < cntLoop; i++) // avx = AVXMax(input, size); // printf("avx = %f\t", avx); // printf("cost time: %d(ms)\n", clock() - start_t); // free(input); //} //结果: //org = 2.989384 cost time : 9491(ms) //sse = 2.989384 cost time : 1261(ms) //avx = 2.989384 cost time : 1413(ms) return 0; }
method.h
#pragma once

// SIMD intrinsics. <intrin.h> is the MSVC umbrella header; other compilers
// expose the same SSE/AVX/FMA intrinsics through <immintrin.h>.
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#include <stdio.h>

// Conventions shared by the SSE*/AVX* routines below:
//  - a null input pointer makes them print a diagnostic and return -1;
//  - SSE* routines work on 4 floats per step, AVX* routines on 8 floats per
//    step, and leftover elements are handled one at a time.
// NOTE(review): check the implementation for buffer-alignment requirements
// before passing pointers that are not 16-/32-byte aligned.

// ---- Multiply-accumulate (dot product) -------------------------------------

/// Scalar reference: returns the sum of input1[i] * input2[i] for i in [0, size).
float MathMulAdd(const float *input1, const float *input2, int size);

/// SSE dot product using a separate multiply and add per 4-float block.
float SSEMulAdd(const float *input1, const float *input2, int size);

/// SSE dot product using a fused multiply-add (_mm_fmadd_ps) per 4-float block.
float SSEFmAdd(const float *input1, const float *input2, int size);

/// AVX dot product using a separate multiply and add per 8-float block.
float AVXMulAdd(const float *input1, const float *input2, int size);

/// AVX dot product using a fused multiply-add (_mm256_fmadd_ps) per 8-float block.
float AVXFmAdd(const float *input1, const float *input2, int size);

// ---- Sum -------------------------------------------------------------------

/// Scalar reference: returns the sum of input[i] for i in [0, size).
float MathSum(const float *input, int size);

/// SSE sum, 4 floats per step.
float SSESum(const float *input, int size);

/// AVX sum, 8 floats per step.
float AVXSum(const float *input, int size);

// ---- Maximum ---------------------------------------------------------------

/// Scalar reference: returns the largest of input[0 .. size-1].
float MathMax(const float *input, int size);

/// SSE maximum, 4 floats per step.
float SSEMax(const float *input, int size);

/// AVX maximum, 8 floats per step.
float AVXMax(const float *input, int size);
method.cpp
#include "method.h"

#include <immintrin.h>
#include <stdio.h>

// Scalar, SSE and AVX implementations of dot product, sum and maximum.
//
// Shared conventions:
//  - A null input pointer prints "input data is null" and returns -1.
//  - The SIMD routines use unaligned loads, so callers may pass any float
//    pointer (for example memory obtained from malloc).
//  - SSEFmAdd / AVXFmAdd execute FMA instructions and therefore need a CPU
//    (and, on GCC/Clang, compiler flags) with FMA support.
//  - Negative or zero `size` yields 0 for the sum / dot-product routines; the
//    Max routines need at least one element and return -1 otherwise.

// ---- internal helpers -------------------------------------------------------

// Prints the shared diagnostic and returns true when `ptr` is null.
static bool IsNullInput(const float *ptr)
{
    if (ptr != nullptr)
        return false;
    printf("input data is null\n");
    return true;
}

// Prints a diagnostic and returns true when there is no element to inspect.
static bool IsEmptyInput(int size)
{
    if (size > 0)
        return false;
    printf("input data is empty\n");
    return true;
}

// Adds the four lanes of `v` as (v0 + v1) + (v2 + v3).
static float HorizontalSum(__m128 v)
{
    v = _mm_hadd_ps(v, v); // lanes: [v0+v1, v2+v3, v0+v1, v2+v3]
    v = _mm_hadd_ps(v, v); // every lane now holds the complete sum
    return _mm_cvtss_f32(v);
}

// Adds the eight lanes of `v`: each 128-bit half is reduced on its own
// (_mm256_hadd_ps never crosses the halves), then the two partial sums are added.
static float HorizontalSum(__m256 v)
{
    v = _mm256_hadd_ps(v, v);
    v = _mm256_hadd_ps(v, v);
    const float lowHalf = _mm_cvtss_f32(_mm256_castps256_ps128(v));
    const float highHalf = _mm_cvtss_f32(_mm256_extractf128_ps(v, 1));
    return lowHalf + highHalf;
}

// Returns the largest of the four lanes of `v`.
static float HorizontalMax(__m128 v)
{
    float lanes[4];
    _mm_storeu_ps(lanes, v);
    float best = lanes[0];
    for (int i = 1; i < 4; i++)
        best = best > lanes[i] ? best : lanes[i];
    return best;
}

// Returns the largest of the eight lanes of `v`.
static float HorizontalMax(__m256 v)
{
    float lanes[8];
    _mm256_storeu_ps(lanes, v);
    float best = lanes[0];
    for (int i = 1; i < 8; i++)
        best = best > lanes[i] ? best : lanes[i];
    return best;
}

// ---- multiply-accumulate (dot product) --------------------------------------

/// Scalar dot product of `input1` and `input2` over `size` elements.
/// @return the accumulated sum, or -1 if either pointer is null.
float MathMulAdd(const float *input1, const float *input2, int size)
{
    if (IsNullInput(input1) || IsNullInput(input2))
        return -1;

    float output = 0.0f;
    for (int i = 0; i < size; i++)
        output += input1[i] * input2[i];
    return output;
}

/// SSE dot product: multiply and add 4 floats per step, scalar loop for the tail.
/// @return the accumulated sum, or -1 if either pointer is null.
float SSEMulAdd(const float *input1, const float *input2, int size)
{
    if (IsNullInput(input1) || IsNullInput(input2))
        return -1;

    const int kLanes = 4;
    const int vecEnd = size - size % kLanes; // first index not covered by a full block

    __m128 acc = _mm_setzero_ps();
    for (int i = 0; i < vecEnd; i += kLanes)
    {
        const __m128 a = _mm_loadu_ps(input1 + i);
        const __m128 b = _mm_loadu_ps(input2 + i);
        acc = _mm_add_ps(acc, _mm_mul_ps(a, b));
    }

    float output = 0.0f;
    output += HorizontalSum(acc);
    for (int i = vecEnd; i < size; i++)
        output += input1[i] * input2[i];
    return output;
}

/// SSE dot product using fused multiply-add, 4 floats per step.
/// Requires FMA support on the executing CPU.
/// @return the accumulated sum, or -1 if either pointer is null.
float SSEFmAdd(const float *input1, const float *input2, int size)
{
    if (IsNullInput(input1) || IsNullInput(input2))
        return -1;

    const int kLanes = 4;
    const int vecEnd = size - size % kLanes;

    __m128 acc = _mm_setzero_ps();
    for (int i = 0; i < vecEnd; i += kLanes)
    {
        const __m128 a = _mm_loadu_ps(input1 + i);
        const __m128 b = _mm_loadu_ps(input2 + i);
        acc = _mm_fmadd_ps(a, b, acc); // acc = a * b + acc, rounded once
    }

    float output = 0.0f;
    output += HorizontalSum(acc);
    for (int i = vecEnd; i < size; i++)
        output += input1[i] * input2[i];
    return output;
}

/// AVX dot product: multiply and add 8 floats per step, scalar loop for the tail.
/// @return the accumulated sum, or -1 if either pointer is null.
float AVXMulAdd(const float *input1, const float *input2, int size)
{
    if (IsNullInput(input1) || IsNullInput(input2))
        return -1;

    const int kLanes = 8;
    const int vecEnd = size - size % kLanes;

    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < vecEnd; i += kLanes)
    {
        const __m256 a = _mm256_loadu_ps(input1 + i);
        const __m256 b = _mm256_loadu_ps(input2 + i);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(a, b));
    }

    float output = 0.0f;
    output += HorizontalSum(acc);
    for (int i = vecEnd; i < size; i++)
        output += input1[i] * input2[i];
    return output;
}

/// AVX dot product using fused multiply-add, 8 floats per step.
/// Requires FMA support on the executing CPU.
/// @return the accumulated sum, or -1 if either pointer is null.
float AVXFmAdd(const float *input1, const float *input2, int size)
{
    if (IsNullInput(input1) || IsNullInput(input2))
        return -1;

    const int kLanes = 8;
    const int vecEnd = size - size % kLanes;

    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < vecEnd; i += kLanes)
    {
        const __m256 a = _mm256_loadu_ps(input1 + i);
        const __m256 b = _mm256_loadu_ps(input2 + i);
        acc = _mm256_fmadd_ps(a, b, acc); // acc = a * b + acc, rounded once
    }

    float output = 0.0f;
    output += HorizontalSum(acc);
    for (int i = vecEnd; i < size; i++)
        output += input1[i] * input2[i];
    return output;
}

// ---- sum --------------------------------------------------------------------

/// Scalar sum of `size` floats.
/// @return the sum, or -1 if `input` is null.
float MathSum(const float *input, int size)
{
    if (IsNullInput(input))
        return -1;

    float output = 0.0f;
    for (int i = 0; i < size; i++)
        output += input[i];
    return output;
}

/// SSE sum, 4 floats per step, scalar loop for the tail.
/// @return the sum, or -1 if `input` is null.
float SSESum(const float *input, int size)
{
    if (IsNullInput(input))
        return -1;

    const int kLanes = 4;
    const int vecEnd = size - size % kLanes;

    __m128 acc = _mm_setzero_ps();
    for (int i = 0; i < vecEnd; i += kLanes)
        acc = _mm_add_ps(acc, _mm_loadu_ps(input + i));

    float output = 0.0f;
    output += HorizontalSum(acc);
    for (int i = vecEnd; i < size; i++)
        output += input[i];
    return output;
}

/// AVX sum, 8 floats per step, scalar loop for the tail.
/// @return the sum, or -1 if `input` is null.
float AVXSum(const float *input, int size)
{
    if (IsNullInput(input))
        return -1;

    const int kLanes = 8;
    const int vecEnd = size - size % kLanes;

    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < vecEnd; i += kLanes)
        acc = _mm256_add_ps(acc, _mm256_loadu_ps(input + i));

    float output = 0.0f;
    output += HorizontalSum(acc);
    for (int i = vecEnd; i < size; i++)
        output += input[i];
    return output;
}

// ---- maximum ----------------------------------------------------------------

/// Scalar maximum of input[0 .. size-1].
/// @return the largest element, or -1 if `input` is null or `size` <= 0.
float MathMax(const float *input, int size)
{
    if (IsNullInput(input) || IsEmptyInput(size))
        return -1;

    float maxVal = input[0];
    for (int i = 1; i < size; i++)
        maxVal = maxVal > input[i] ? maxVal : input[i];
    return maxVal;
}

/// SSE maximum, 4 floats per step, scalar loop for the tail.
/// Inputs shorter than one 4-float block are handled entirely by the scalar
/// loop, so nothing beyond input[size-1] is ever read.
/// @return the largest element, or -1 if `input` is null or `size` <= 0.
float SSEMax(const float *input, int size)
{
    if (IsNullInput(input) || IsEmptyInput(size))
        return -1;

    const int kLanes = 4;
    const int vecEnd = size - size % kLanes;

    float maxVal = input[0];
    if (vecEnd > 0)
    {
        // Seed the running maximum with the first block, then fold in the rest.
        __m128 maxVec = _mm_loadu_ps(input);
        for (int i = kLanes; i < vecEnd; i += kLanes)
            maxVec = _mm_max_ps(maxVec, _mm_loadu_ps(input + i));
        maxVal = HorizontalMax(maxVec);
    }

    for (int i = vecEnd; i < size; i++)
        maxVal = maxVal > input[i] ? maxVal : input[i];
    return maxVal;
}

/// AVX maximum, 8 floats per step, scalar loop for the tail.
/// Inputs shorter than one 8-float block are handled entirely by the scalar
/// loop, so nothing beyond input[size-1] is ever read.
/// @return the largest element, or -1 if `input` is null or `size` <= 0.
float AVXMax(const float *input, int size)
{
    if (IsNullInput(input) || IsEmptyInput(size))
        return -1;

    const int kLanes = 8;
    const int vecEnd = size - size % kLanes;

    float maxVal = input[0];
    if (vecEnd > 0)
    {
        // Seed the running maximum with the first block, then fold in the rest.
        __m256 maxVec = _mm256_loadu_ps(input);
        for (int i = kLanes; i < vecEnd; i += kLanes)
            maxVec = _mm256_max_ps(maxVec, _mm256_loadu_ps(input + i));
        maxVal = HorizontalMax(maxVec);
    }

    for (int i = vecEnd; i < size; i++)
        maxVal = maxVal > input[i] ? maxVal : input[i];
    return maxVal;
}