Matmul operation does not always give the correct result

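Depending on how the CTArray is initialized (the packing `mode` and the `batch_size` passed to `onp.array`), the result of encrypted matrix multiplication may be incorrect, i.e. it does not match the result of numpy matmul on the same plaintext matrix. Is this by design, or is this a bug? The output of the reproduction script below is: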

Batch size: 16384
Starting matrix A:
[[1 2]
 [3 4]]
Decrypted result of A @ A:
Mode 'zero':
[[ 7.00000000e+00  8.00000000e+00]
 [ 1.20000000e+01 -3.38833803e-13]]
Mode 'tile':
[[ 7. 10.]
 [15. 22.]]
Mode 'tile' with batch_size = 4:
[[ 7.00000000e+00  8.00000000e+00]
 [ 1.20000000e+01 -5.02535914e-13]]
Mode 'tile' with batch_size = 16:
[[ 7. 10.]
 [15. 22.]]
Expected result:
[[ 7 10]
 [15 22]]

I have included the code to reproduce my result here:

import numpy as np
from openfhe import *
import openfhe_numpy as onp

# CKKS parameter set used for the whole reproduction
params = CCParamsCKKSRNS()
params.SetMultiplicativeDepth(5)
params.SetScalingModSize(59)
params.SetFirstModSize(60)
params.SetScalingTechnique(FIXEDAUTO)
params.SetSecretKeyDist(UNIFORM_TERNARY)

# Build the crypto context and switch on the scheme features, in order
cc = GenCryptoContext(params)
for feature in (
    PKESchemeFeature.PKE,
    PKESchemeFeature.LEVELEDSHE,
    PKESchemeFeature.ADVANCEDSHE,
):
    cc.Enable(feature)

# Key pair, then the multiplication and summation evaluation keys
keys = cc.KeyGen()
for generate_eval_key in (cc.EvalMultKeyGen, cc.EvalSumKeyGen):
    generate_eval_key(keys.secretKey)

# The "full" batch size used below is half of the ring dimension
ring_dim = cc.GetRingDimension()
batch_size = ring_dim // 2
print("Batch size:", batch_size)

# Plaintext input matrix
A = np.array([[1, 2], [3, 4]])
print("Starting matrix A:")
print(A)

# Encrypt A with OpenFHE-NumPy four times. Only `batch_size` and `mode`
# differ between the variants; every other argument is shared.
_shared_kwargs = dict(
    cc=cc,
    data=A,
    order=onp.ROW_MAJOR,
    fhe_type="C",
    public_key=keys.publicKey,
)

# mode 'zero', full batch size
ctm_Az = onp.array(batch_size=batch_size, mode="zero", **_shared_kwargs)

# mode 'tile', full batch size
ctm_At = onp.array(batch_size=batch_size, mode="tile", **_shared_kwargs)

# mode 'tile' with batch_size = A.size = 4
ctm_Atb = onp.array(batch_size=A.size, mode="tile", **_shared_kwargs)

# mode 'tile' with batch_size = A.size ** 2 = 16
ctm_Atb2 = onp.array(batch_size=A.size ** 2, mode="tile", **_shared_kwargs)

# Generate the rotation keys used by the square matrix product.
# One key set is generated for ctm_Az.ncols and reused by every product
# computed afterwards, so all four ciphertext matrices (including ctm_Atb2)
# must report the same column count.
assert ctm_Az.ncols == ctm_At.ncols
assert ctm_At.ncols == ctm_Atb.ncols
assert ctm_Atb.ncols == ctm_Atb2.ncols
onp.EvalSquareMatMultRotateKeyGen(keys.secretKey, ctm_Az.ncols)

# Square each encrypted matrix (A @ A), in the order the variants were created
product_Az, product_At, product_Atb, product_Atb2 = (
    ctm @ ctm for ctm in (ctm_Az, ctm_At, ctm_Atb, ctm_Atb2)
)

# Decrypt every product with the secret key, unpacking as "original"
decrypted_Az, decrypted_At, decrypted_Atb, decrypted_Atb2 = (
    product.decrypt(keys.secretKey, unpack_type="original")
    for product in (product_Az, product_At, product_Atb, product_Atb2)
)

print("Decrypted result of A @ A:")

# (heading, decrypted product) pairs, printed in order.
# The trailing notes record what the issue reporter observed for each variant.
for heading, decrypted in (
    ("Mode 'zero':", decrypted_Az),  # observed: incorrect
    ("Mode 'tile':", decrypted_At),  # observed: correct
    ("Mode 'tile' with batch_size = 4:", decrypted_Atb),  # observed: incorrect
    ("Mode 'tile' with batch_size = 16:", decrypted_Atb2),  # observed: correct
):
    print(heading)
    print(decrypted)

# Plaintext reference computed by numpy
print("Expected result:")
print(A @ A)