Semantic Testing with Vector Embeddings
February 5, 2026 · 5 min read
Traditional assertions check exact values: assert response == "expected". This breaks when LLMs rephrase correct answers, when localization changes wording, or when acceptable output has multiple valid forms. Vector embeddings enable semantic assertions: "this response means the same thing as the expected answer."
What Are Embeddings?
An embedding is a numerical vector representation of text where similar meanings produce vectors that are close together in high-dimensional space. The similarity between two texts can be measured with cosine similarity:
import numpy as np
from openai import OpenAI
openai_client = OpenAI()
def embed(text: str) -> list[float]:
    """Embed *text* with OpenAI's text-embedding-3-small model and return the raw vector."""
    api_response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return api_response.data[0].embedding
def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """Cosine of the angle between two vectors: dot product over the product of norms."""
    u, v = np.asarray(vec1), np.asarray(vec2)
    return float((u @ v) / (np.linalg.norm(u) * np.linalg.norm(v)))
# Example: similar statements score high, unrelated ones score low.
text1 = "Playwright supports Chromium, Firefox, and WebKit browsers."
text2 = "You can use Playwright to test across Chromium, Firefox, and WebKit."
text3 = "Python is a programming language used for data science."
# NOTE: every embed() call is a separate API request; text1 is embedded twice here.
sim_12 = cosine_similarity(embed(text1), embed(text2)) # ~0.93 — semantically similar
sim_13 = cosine_similarity(embed(text1), embed(text3)) # ~0.45 — semantically different
print(f"Similarity 1-2: {sim_12:.2f}") # 0.93
print(f"Similarity 1-3: {sim_13:.2f}") # 0.45

Semantic Assertions in pytest
import pytest
import numpy as np
SEMANTIC_THRESHOLD = 0.85 # Cosine similarity required to "pass"

def assert_semantically_equal(actual: str, expected: str, threshold: float = SEMANTIC_THRESHOLD):
    """Assert that actual and expected have similar meaning (cosine >= threshold)."""
    score = cosine_similarity(embed(actual), embed(expected))
    failure_detail = (
        f"Semantic similarity {score:.2f} below threshold {threshold}\n"
        f"Expected meaning: {expected[:100]}...\n"
        f"Actual: {actual[:100]}..."
    )
    assert score >= threshold, failure_detail
# Use in tests
def test_summary_captures_correct_meaning():
    """A 20-word summary must preserve the article's core claim, however it is phrased."""
    article = "Docker BuildKit enables parallel build stages and cache mounts for faster builds."
    summary = ArticleSummarizer().summarize(article, max_words=20)
    assert_semantically_equal(
        actual=summary,
        expected="BuildKit makes Docker builds faster with parallel stages and caching.",
        threshold=0.80,
    )
def test_error_message_conveys_correct_information():
from myapp.api import handle_validation_error
error_response = handle_validation_error("email", "invalid format")
expected_meaning = "The email field has an invalid format"
assert_semantically_equal(error_response["message"], expected_meaning, threshold=0.85)Semantic Test Deduplication
Large test suites accumulate duplicate test cases with slightly different wording. Find and deduplicate semantically:
from dataclasses import dataclass
import json
@dataclass
class TestCase:
    """One test case record as deserialized from test_cases.json."""
    # NOTE(review): a class named Test* in a pytest-scanned module triggers a
    # collection warning; consider renaming or setting __test__ = False — confirm.
    id: str  # stable identifier used when reporting duplicates
    title: str  # short name; combined with description for embedding
    description: str
def find_duplicate_tests(test_cases: list[TestCase], threshold: float = 0.92) -> list[tuple]:
    """Return (case_a, case_b, similarity) triples for semantically near-duplicate tests.

    Each case is embedded once as "title: description"; every unordered pair is
    scored, and pairs at or above *threshold* come back sorted most-similar-first.
    """
    vectors = [embed(f"{tc.title}: {tc.description}") for tc in test_cases]
    near_pairs = []
    for i, first in enumerate(test_cases):
        for j in range(i + 1, len(test_cases)):
            score = cosine_similarity(vectors[i], vectors[j])
            if score >= threshold:
                near_pairs.append((first, test_cases[j], score))
    near_pairs.sort(key=lambda pair: pair[2], reverse=True)
    return near_pairs
# Load test cases from JSON: a list of objects with id/title/description keys,
# matching the TestCase dataclass fields.
with open("test_cases.json") as f:
    raw = json.load(f)
cases = [TestCase(**tc) for tc in raw]
# 0.90 is slightly looser than the function's 0.92 default — surfaces more candidates.
duplicates = find_duplicate_tests(cases, threshold=0.90)
for tc1, tc2, similarity in duplicates:
    print(f"Potential duplicate ({similarity:.2f}):")
    print(f"  - {tc1.id}: {tc1.title}")
    print(f"  - {tc2.id}: {tc2.title}")

RAG Retrieval Quality Testing
Test that your vector search retrieves the most relevant documents:
import chromadb
client = chromadb.Client()
collection = client.get_collection("documentation")
def test_retrieval_precision(query: str, relevant_doc_ids: list[str], top_k: int = 5) -> float:
    """Measure precision@k: the fraction of the top-k retrieved results that are relevant.

    Args:
        query: Natural-language search query run against the collection.
        relevant_doc_ids: Document IDs judged relevant to the query (the golden set).
        top_k: Number of results to retrieve and score.

    Returns:
        Precision in [0, 1]; 0.0 when nothing is retrieved.

    NOTE(review): the test_* name with required parameters makes pytest try to
    collect this helper and fail on missing fixtures; renaming to
    retrieval_precision would fix that — name kept so existing callers work.
    """
    results = collection.query(query_texts=[query], n_results=top_k)
    retrieved_ids = results["ids"][0]
    if not retrieved_ids:
        return 0.0
    relevant_in_top_k = len(set(retrieved_ids) & set(relevant_doc_ids))
    # Precision divides by the number retrieved; the original divided by
    # len(relevant_doc_ids), which is recall, contradicting the docstring.
    return relevant_in_top_k / len(retrieved_ids)
# Golden retrieval test set: each query is paired with the doc IDs judged
# relevant to it, so retrieval quality can be scored deterministically.
RETRIEVAL_TESTS = [
    {
        "query": "how to configure retries in playwright",
        "relevant_doc_ids": ["playwright-config", "playwright-retries", "playwright-advanced"],
    },
    {
        "query": "docker multi-stage build example",
        "relevant_doc_ids": ["docker-multistage", "docker-optimization"],
    },
]
def test_retrieval_quality():
precisions = []
for test in RETRIEVAL_TESTS:
precision = test_retrieval_precision(
query=test["query"],
relevant_doc_ids=test["relevant_doc_ids"]
)
precisions.append(precision)
assert precision >= 0.6, (
f"Retrieval precision {precision:.2f} too low for query: {test['query']}"
)
avg_precision = sum(precisions) / len(precisions)
print(f"Average retrieval precision: {avg_precision:.2f}")Semantic Change Detection
Detect when application responses change meaning between deployments:
import json
from pathlib import Path
from datetime import datetime
class SemanticChangeDetector:
    """Detects meaning-level drift in application responses between deployments.

    Workflow: call capture_baseline() before a model/prompt change to store each
    response plus its embedding; call detect_changes() afterwards to re-run the
    same inputs and flag responses whose embedding moved too far from baseline.
    """

    def __init__(self, baseline_file: str = "semantic_baseline.json"):
        self.baseline_file = Path(baseline_file)
        self.baseline = self._load()

    def _load(self) -> dict:
        """Return the persisted baseline, or an empty dict when none exists yet."""
        if self.baseline_file.exists():
            return json.loads(self.baseline_file.read_text())
        return {}

    def capture_baseline(self, test_inputs: list[dict], response_fn):
        """Run inputs, embed responses, save as baseline.

        Args:
            test_inputs: dicts with "id" and "input" keys.
            response_fn: callable mapping an input string to a response string.
        """
        from datetime import timezone  # local import keeps this snippet self-contained

        baseline = {}
        for item in test_inputs:
            response = response_fn(item["input"])
            baseline[item["id"]] = {
                "input": item["input"],
                "response": response,
                "embedding": embed(response),
                # Timezone-aware timestamp; datetime.utcnow() is naive and
                # deprecated since Python 3.12.
                "captured_at": datetime.now(timezone.utc).isoformat(),
            }
        self.baseline_file.write_text(json.dumps(baseline, indent=2))
        self.baseline = baseline
        print(f"Baseline captured for {len(baseline)} inputs")

    def detect_changes(self, response_fn, threshold: float = 0.90) -> list[dict]:
        """Compare current responses against baseline.

        Returns:
            One record per regression — entries whose current response embeds
            below *threshold* cosine similarity to the stored baseline embedding.
        """
        regressions = []
        for item_id, baseline_data in self.baseline.items():
            current_response = response_fn(baseline_data["input"])
            similarity = cosine_similarity(
                embed(current_response),
                baseline_data["embedding"],
            )
            if similarity < threshold:
                regressions.append({
                    "id": item_id,
                    # Truncate long texts so regression reports stay readable.
                    "input": baseline_data["input"][:100],
                    "baseline_response": baseline_data["response"][:100],
                    "current_response": current_response[:100],
                    "similarity": similarity,
                })
        return regressions
# Usage
detector = SemanticChangeDetector("baseline.json")
# Before a model/prompt update: capture baseline
detector.capture_baseline(
    test_inputs=[{"id": "q1", "input": "What is Playwright?"}],
    response_fn=lambda q: summarizer.answer(q)  # NOTE(review): `summarizer` is not defined in these snippets — must exist in scope
)
# After the update: detect regressions (default similarity threshold 0.90)
regressions = detector.detect_changes(lambda q: summarizer.answer(q))
if regressions:
    for r in regressions:
        print(f"Semantic regression in {r['id']}: similarity={r['similarity']:.2f}")

Semantic testing with embeddings won't replace traditional assertions for deterministic behavior, but it fills the gap for any output that has multiple valid forms. Combined with a golden dataset and similarity thresholds tuned to your domain, it provides a quantitative safety net against meaning-level regressions.