Hi all,
I don’t quite understand how to use #pragma omp parallel with OpenFHE. If multiple cores are available, does the library share the workload between them by default?
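For reference, this is the minimal check I would use to see what the OpenMP runtime reports on a given machine. The calls are standard OpenMP from <omp.h>; I am assuming OpenFHE was built with OpenMP support (i.e. WITH_OPENMP=ON):

#include <omp.h>
#include <iostream>

int main() {
    // Number of threads the next parallel region is allowed to use
    std::cout << "omp_get_max_threads(): " << omp_get_max_threads() << std::endl;
    // Number of logical processors visible to the OpenMP runtime
    std::cout << "omp_get_num_procs():   " << omp_get_num_procs() << std::endl;
    return 0;
}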
Here is my concrete example, a comparison benchmarking program:
#include "openfhe.h"
#include "binfhecontext.h"
#include <vector>
#include <random>
#include <thread>
#include "openfhecore.h"
#include <pthread.h>
using namespace lbcrypto;
std::vector<double> GenerateRandomVector(size_t size, double minVal, double maxVal) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<double> dis(minVal, maxVal);
std::vector<double> randomVector;
randomVector.reserve(size);
for (size_t i = 0; i < size; ++i) {
double randomValue = dis(gen);
randomVector.push_back(randomValue);
}
return randomVector;
}
void ComparisonTest() {
    TimeVar t;
    double processingTime(0.0);

    ScalingTechnique scTech = FIXEDAUTO;
    uint32_t multDepth = 17;
    if (scTech == FLEXIBLEAUTOEXT)
        multDepth += 1;
    uint32_t scaleModSize = 50;
    uint32_t firstModSize = 60;
    uint32_t ringDim = 65536;
    SecurityLevel sl = HEStd_128_classic;
    BINFHE_PARAMSET slBin = STD128;
    uint32_t logQ_ccLWE = 25;
    uint32_t slots = 16;  // sparsely-packed
    uint32_t batchSize = slots;

    CCParams<CryptoContextCKKSRNS> parameters;
    parameters.SetMultiplicativeDepth(multDepth);
    parameters.SetScalingModSize(scaleModSize);
    parameters.SetFirstModSize(firstModSize);
    parameters.SetScalingTechnique(scTech);
    parameters.SetSecurityLevel(sl);
    parameters.SetRingDim(ringDim);
    parameters.SetBatchSize(batchSize);
    parameters.SetSecretKeyDist(UNIFORM_TERNARY);
    parameters.SetKeySwitchTechnique(HYBRID);
    parameters.SetNumLargeDigits(3);

    CryptoContext<DCRTPoly> cc = GenCryptoContext(parameters);

    // Enable the features that you wish to use
    cc->Enable(PKE);
    cc->Enable(KEYSWITCH);
    cc->Enable(LEVELEDSHE);
    cc->Enable(ADVANCEDSHE);
    cc->Enable(SCHEMESWITCH);

    // Generate encryption keys.
    auto keys = cc->KeyGen();

    // Step 2: Prepare the FHEW cryptocontext and keys for FHEW and scheme switching
    auto FHEWparams = cc->EvalSchemeSwitchingSetup(sl, slBin, false, logQ_ccLWE, false, slots);
    auto ccLWE = FHEWparams.first;
    auto privateKeyFHEW = FHEWparams.second;
    ccLWE.BTKeyGen(privateKeyFHEW);
    cc->EvalSchemeSwitchingKeyGen(keys, privateKeyFHEW);

    // Set the scaling factor to be able to decrypt; the LWE mod switch is performed on the ciphertext at the last level
    auto modulus_LWE = 1 << logQ_ccLWE;
    auto beta = ccLWE.GetBeta().ConvertToInt();
    auto pLWE2 = modulus_LWE / (2 * beta);  // Large precision
    double scaleSignFHEW = 1.0;
    const auto cryptoParams = std::dynamic_pointer_cast<CryptoParametersCKKSRNS>(cc->GetCryptoParameters());
    uint32_t init_level = 0;
    if (cryptoParams->GetScalingTechnique() == FLEXIBLEAUTOEXT)
        init_level = 1;
    cc->EvalCompareSwitchPrecompute(pLWE2, init_level, scaleSignFHEW);

    std::vector<double> x1 = GenerateRandomVector(slots, -8, 8);
    Plaintext ptxt1 = cc->MakeCKKSPackedPlaintext(x1, 1, 0, nullptr, slots);
    auto ciphertexts1 = cc->Encrypt(keys.publicKey, ptxt1);

    std::vector<double> x2 = GenerateRandomVector(slots, -8, 8);
    Plaintext ptxt2 = cc->MakeCKKSPackedPlaintext(x2, 1, 0, nullptr, slots);
    auto ciphertexts2 = cc->Encrypt(keys.publicKey, ptxt2);

    TIC(t);
    auto ciphertextResult = cc->EvalCompareSchemeSwitching(ciphertexts1, ciphertexts2, slots, slots);
    processingTime = TOC(t);
    std::cout << "Comparison DONE in " << processingTime / 1000.0 << " (seconds)" << std::endl;
}
int main() {
    ComparisonTest();
    return 0;
}
While the program is running, the CPU load looks like this:
The program completes in 44 seconds.
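Incidentally, to get a strictly single-core baseline, I suppose one could pin the OpenMP runtime to a single thread before any OpenFHE call. This is only a sketch: omp_set_num_threads() is standard OpenMP from <omp.h>, and whether OpenFHE’s internal loops fully respect it is my assumption:

#include <omp.h>

int main() {
    omp_set_num_threads(1);  // hypothetical: force one OpenMP thread for the whole run
    ComparisonTest();
    return 0;
}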
I made another “parallel” version of my program, following the example from here:
#include "openfhe.h"
#include "binfhecontext.h"
#include <vector>
#include <random>
#include <thread>
#include "openfhecore.h"
#include <pthread.h>
using namespace lbcrypto;
std::vector<double> GenerateRandomVector(size_t size, double minVal, double maxVal) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<double> dis(minVal, maxVal);
std::vector<double> randomVector;
randomVector.reserve(size);
for (size_t i = 0; i < size; ++i) {
double randomValue = dis(gen);
randomVector.push_back(randomValue);
}
return randomVector;
}
void ComparisonTest() {
    TimeVar t;
    double processingTime(0.0);

    // Make sure OpenFHE's internal OpenMP parallelization is enabled
    lbcrypto::OpenFHEParallelControls.Enable();

    ScalingTechnique scTech = FIXEDAUTO;
    uint32_t multDepth = 17;
    if (scTech == FLEXIBLEAUTOEXT)
        multDepth += 1;
    uint32_t scaleModSize = 50;
    uint32_t firstModSize = 60;
    uint32_t ringDim = 65536;
    SecurityLevel sl = HEStd_128_classic;
    BINFHE_PARAMSET slBin = STD128;
    uint32_t logQ_ccLWE = 25;
    uint32_t slots = 16;  // sparsely-packed
    uint32_t batchSize = slots;

    CCParams<CryptoContextCKKSRNS> parameters;
    parameters.SetMultiplicativeDepth(multDepth);
    parameters.SetScalingModSize(scaleModSize);
    parameters.SetFirstModSize(firstModSize);
    parameters.SetScalingTechnique(scTech);
    parameters.SetSecurityLevel(sl);
    parameters.SetRingDim(ringDim);
    parameters.SetBatchSize(batchSize);
    parameters.SetSecretKeyDist(UNIFORM_TERNARY);
    parameters.SetKeySwitchTechnique(HYBRID);
    parameters.SetNumLargeDigits(3);

    CryptoContext<DCRTPoly> cc = GenCryptoContext(parameters);

    // Enable the features that you wish to use
    cc->Enable(PKE);
    cc->Enable(KEYSWITCH);
    cc->Enable(LEVELEDSHE);
    cc->Enable(ADVANCEDSHE);
    cc->Enable(SCHEMESWITCH);

    // Generate encryption keys.
    auto keys = cc->KeyGen();

    // Step 2: Prepare the FHEW cryptocontext and keys for FHEW and scheme switching
    auto FHEWparams = cc->EvalSchemeSwitchingSetup(sl, slBin, false, logQ_ccLWE, false, slots);
    auto ccLWE = FHEWparams.first;
    auto privateKeyFHEW = FHEWparams.second;
    ccLWE.BTKeyGen(privateKeyFHEW);
    cc->EvalSchemeSwitchingKeyGen(keys, privateKeyFHEW);

    // Set the scaling factor to be able to decrypt; the LWE mod switch is performed on the ciphertext at the last level
    auto modulus_LWE = 1 << logQ_ccLWE;
    auto beta = ccLWE.GetBeta().ConvertToInt();
    auto pLWE2 = modulus_LWE / (2 * beta);  // Large precision
    double scaleSignFHEW = 1.0;
    const auto cryptoParams = std::dynamic_pointer_cast<CryptoParametersCKKSRNS>(cc->GetCryptoParameters());
    uint32_t init_level = 0;
    if (cryptoParams->GetScalingTechnique() == FLEXIBLEAUTOEXT)
        init_level = 1;
    cc->EvalCompareSwitchPrecompute(pLWE2, init_level, scaleSignFHEW);
    const auto numOfCores = std::thread::hardware_concurrency();
    std::cout << "Distributed computation over " << numOfCores << " cores" << std::endl;

    // Prepare one pair of ciphertexts per core
    std::vector<Ciphertext<DCRTPoly>> ciphertexts1;
    for (uint32_t core = 0; core < numOfCores; core++) {
        std::vector<double> x = GenerateRandomVector(slots, -8, 8);
        Plaintext ptxt = cc->MakeCKKSPackedPlaintext(x, 1, 0, nullptr, slots);
        auto c = cc->Encrypt(keys.publicKey, ptxt);
        ciphertexts1.push_back(c);
    }
    std::vector<Ciphertext<DCRTPoly>> ciphertexts2;
    for (uint32_t core = 0; core < numOfCores; core++) {
        std::vector<double> x = GenerateRandomVector(slots, -8, 8);
        Plaintext ptxt = cc->MakeCKKSPackedPlaintext(x, 1, 0, nullptr, slots);
        auto c = cc->Encrypt(keys.publicKey, ptxt);
        ciphertexts2.push_back(c);
    }
    TIC(t);
    // Run one comparison per core in parallel
    #pragma omp parallel for
    for (uint32_t core = 0; core < numOfCores; core++) {
        // Result intentionally discarded; we only time the evaluation
        auto cResult = cc->EvalCompareSchemeSwitching(ciphertexts1[core], ciphertexts2[core], slots, slots);
    }
    processingTime = TOC(t);
    std::cout << "Comparison DONE in " << processingTime / 1000.0 << " (seconds)" << std::endl;
}
int main() {
    ComparisonTest();
    return 0;
}
From my understanding, this program should have a runtime similar to the first one: in the first version we compare a single pair of ciphertexts on one core, while in the second we compare 24 ciphertext pairs in parallel across 24 cores. However, this version runs at least 8 times longer than the initial program (I stopped the execution after a while). The CPU load is also different: this time, all 24 cores are at 100% for the whole execution.
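If the problem is nested parallelism (each of the 24 outer threads letting the library spawn its own OpenMP team inside EvalCompareSchemeSwitching), would the fix be to cap the runtime to a single level of parallelism? A minimal sketch of what I have in mind, using only standard OpenMP calls from <omp.h> (whether OpenFHE’s internal loops respect these settings is my assumption):

#include <omp.h>

// Restrict OpenMP to one level of active parallelism, so each outer
// thread runs the library call single-threaded instead of spawning a
// nested team (assumption: OpenFHE's internals honor this setting).
omp_set_max_active_levels(1);
#pragma omp parallel for
for (uint32_t core = 0; core < numOfCores; core++) {
    auto cResult = cc->EvalCompareSchemeSwitching(ciphertexts1[core], ciphertexts2[core], slots, slots);
}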
My goal is to benchmark the comparison operation when the work is distributed over many cores. What am I missing?
Thank you!