C  INT8_MATMUL — symmetric per-tensor int8 x int8 matrix multiply
C  with int32 accumulation. Output is int32 (no requantisation).
C
C  Reference: B. Jacob et al., "Quantization and Training of Neural
C  Networks for Efficient Integer-Arithmetic-Only Inference",
C  CVPR 2018 (arXiv:1712.05877). Symmetric formulation with both
C  input zero-points fixed at zero.
C
C  Hand-written reference for the Dark Factory's Phase 3
C  inference-kernel ladder, 2026-05-23. Public domain.
C
C  Shapes (column-major):
C    A is M x K (INTEGER*1)
C    B is K x N (INTEGER*1)
C    C is M x N (INTEGER, 32-bit accumulator)
C
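C  Overflow note: each output element is a sum of K signed
C  int8 x int8 products. With symmetric quantisation (values kept
C  in [-127, 127]) the worst case |product| = 127 * 127 = 16129,
C  so a 32-bit accumulator is safe for K up to 133144
C  (= floor((2**31 - 1) / 16129)). If the value -128 can occur,
C  the worst case is (-128) * (-128) = 16384 and the safe bound
C  drops to K <= 131071.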

      SUBROUTINE INT8_MATMUL (M, N, K, A, LDA, B, LDB, C, LDC)
C  Purpose:
C    Overwrite C with the matrix product A * B, where A and B hold
C    signed 8-bit integers and every product and sum is carried
C    out in default INTEGER (assumed to be 32 bits wide).
C
C  Inputs:
C    M, N, K  - matrix dimensions. If M <= 0 or N <= 0 nothing is
C               written; if K <= 0 the M x N part of C is zeroed.
C    A        - M x K int8 matrix, leading dimension LDA (>= M)
C    B        - K x N int8 matrix, leading dimension LDB (>= K)
C    LDC      - leading dimension of C (>= M)
C  Output:
C    C        - M x N int32 matrix. Only rows 1..M of columns
C               1..N are written; padding rows are left untouched.
C
C  Notes:
C    * Each operand is widened with INT() BEFORE the multiply.
C      The product of two INTEGER*1 values has kind INTEGER*1, so
C      an unwidened A(I,L) * B(L,J) would overflow in 8 bits
C      (e.g. 127 * 127 would not yield 16129).
C    * No overflow check is made on the 32-bit sums; the caller
C      must keep K small enough for its value range.
C    * Loop order is J-L-I so the innermost loop walks A and C
C      down a column (stride 1 in column-major storage). Integer
C      addition is order-independent, so the result is the same
C      as the row-times-column dot-product formulation.
      IMPLICIT NONE
      INTEGER M, N, K, LDA, LDB, LDC
      INTEGER*1 A(LDA, *), B(LDB, *)
      INTEGER C(LDC, *)
      INTEGER I, J, L, BLJ
      DO 40 J = 1, N
C        Start column J of C from zero (C is overwritten).
         DO 10 I = 1, M
            C(I, J) = 0
   10    CONTINUE
         DO 30 L = 1, K
C           Widen the B factor once; it is invariant in the I loop.
            BLJ = INT(B(L, J))
            DO 20 I = 1, M
               C(I, J) = C(I, J) + INT(A(I, L)) * BLJ
   20       CONTINUE
   30    CONTINUE
   40 CONTINUE
      RETURN
      END