Using the Claude API for Intelligent Test Automation
January 25, 2026 · 6 min read
The Claude API goes beyond simple text generation. Tool use enables structured, multi-step workflows. The vision API understands screenshots. Streaming handles long-running operations. These capabilities combine to create test automation that adapts to application behavior rather than brittle scripted interactions.
Anthropic Python SDK Basics
pip install anthropic
import anthropic
import os

# The client reads the API key explicitly from the environment;
# anthropic.Anthropic() would also pick up ANTHROPIC_API_KEY by default.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# Basic message: a single-turn request/response round trip.
message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "What makes a good test case?"}
    ]
)
print(message.content[0].text)
Generating Test Cases with Tool Use
Tool use provides reliable structured output — Claude calls the tool with properly typed arguments:
import anthropic
import json

client = anthropic.Anthropic()

# Tool schema: forcing Claude to "call" this tool yields typed, structured
# test cases instead of free-form prose that would need fragile parsing.
tools = [
    {
        "name": "create_test_cases",
        "description": "Generate test cases for a feature description",
        "input_schema": {
            "type": "object",
            "required": ["test_cases"],
            "properties": {
                "test_cases": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        # test_type is deliberately absent here, so it is optional.
                        "required": ["id", "title", "preconditions", "steps", "expected_result", "priority"],
                        "properties": {
                            "id": {"type": "string", "description": "TC-001 format"},
                            "title": {"type": "string"},
                            "preconditions": {"type": "array", "items": {"type": "string"}},
                            "steps": {"type": "array", "items": {"type": "string"}},
                            "expected_result": {"type": "string"},
                            "priority": {"type": "string", "enum": ["P0", "P1", "P2", "P3"]},
                            "test_type": {"type": "string", "enum": ["functional", "security", "performance", "usability"]}
                        }
                    }
                }
            }
        }
    }
]
def generate_test_cases(feature_description: str, existing_cases: int = 0) -> list[dict]:
    """Ask Claude for structured test cases via a forced tool call.

    Args:
        feature_description: Plain-text description of the feature under test.
        existing_cases: Count of already-numbered cases, used to continue
            the TC-XXX numbering sequence.

    Returns:
        The list of test-case dicts from the tool invocation, or an empty
        list if no ``tool_use`` block came back.
    """
    prompt = f"""Generate comprehensive test cases for this feature.
Feature: {feature_description}
Requirements:
- Cover happy path, error cases, edge cases, and security scenarios
- Use TC-{existing_cases + 1:03d} numbering
- Be specific about steps and expected results
- Include at least one P0 (critical) test case"""
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=4000,
        tools=tools,
        # Force the model to answer through the tool rather than with prose.
        tool_choice={"type": "tool", "name": "create_test_cases"},
        messages=[{"role": "user", "content": prompt}],
    )
    tool_call = next((b for b in response.content if b.type == "tool_use"), None)
    return tool_call.input["test_cases"] if tool_call is not None else []
# Usage
feature = """
User authentication with two-factor authentication (2FA).
Users can enable 2FA via TOTP (authenticator apps) or SMS.
After password login, users with 2FA enabled must provide a valid 6-digit code.
Failed 2FA attempts are rate-limited to 5 per hour.
"""
cases = generate_test_cases(feature)
# One summary line per generated case, e.g. "[P0] TC-001: ...".
for case in cases:
    print(f"[{case['priority']}] {case['id']}: {case['title']}")
Vision API for UI Validation
Pass screenshots to Claude for intelligent visual analysis:
import anthropic
import base64
from pathlib import Path

# NOTE(review): this snippet also calls json.loads below; json is imported
# earlier in the file but must be imported too if this snippet is copied alone.
client = anthropic.Anthropic()
def analyze_screenshot(screenshot_path: str, context: str) -> dict:
    """Send a UI screenshot to Claude and return its visual-issue report.

    Args:
        screenshot_path: Path to a .png/.jpg/.jpeg/.webp/.gif screenshot.
        context: Description of what the page is expected to show.

    Returns:
        dict with "issues" (list of findings) and "overall_status"
        ("pass" | "fail" | "warning"), as requested from the model.

    Raises:
        json.JSONDecodeError: If the model reply cannot be parsed as JSON.
    """
    image_data = Path(screenshot_path).read_bytes()
    image_base64 = base64.standard_b64encode(image_data).decode("utf-8")
    # Detect media type from extension. ".jpeg" and ".gif" were previously
    # missing from this map and silently fell back to image/png, mislabeling
    # the uploaded data.
    ext = Path(screenshot_path).suffix.lower()
    media_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }
    media_type = media_types.get(ext, "image/png")
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1000,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_base64,
                    },
                },
                {
                    "type": "text",
                    "text": f"""Analyze this UI screenshot for the following context:
{context}
Identify:
1. Any visual errors, broken layouts, or misaligned elements
2. Missing content that should be visible
3. Accessibility concerns (contrast, text size)
4. Any text that appears truncated or overflowing
Return JSON: {{"issues": [{{"type": str, "severity": "high|medium|low", "description": str, "location": str}}], "overall_status": "pass|fail|warning"}}"""
                }
            ],
        }]
    )
    # Models occasionally wrap JSON in markdown fences; strip them before parsing.
    text = response.content[0].text.strip()
    if text.startswith("```"):
        text = text.strip("`").removeprefix("json").strip()
    return json.loads(text)
# Use in Playwright tests
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("http://localhost:3000/dashboard")
    page.screenshot(path="/tmp/dashboard.png")
    # Let Claude judge the screenshot against the expected page content.
    analysis = analyze_screenshot(
        "/tmp/dashboard.png",
        "Dashboard showing user analytics with charts and a navigation sidebar"
    )
    if analysis["overall_status"] == "fail":
        # Surface every reported issue in the assertion message for triage.
        issues = "\n".join([f"- [{i['severity']}] {i['description']}" for i in analysis["issues"]])
        raise AssertionError(f"Visual issues detected:\n{issues}")
    browser.close()
Multi-Turn Conversations for Test Planning
Use multi-turn conversations to refine test strategies interactively:
def interactive_test_planner(requirements: str) -> str:
    """Multi-turn conversation to develop a test plan.

    Returns the assistant's final prioritized plan as plain text.
    (Return annotation corrected: the function returns a single string
    from ``response.content[0].text``, not a list.)
    """
    messages = []
    # Initial context
    messages.append({
        "role": "user",
        "content": f"I need to test a new feature. Here are the requirements:\n\n{requirements}\n\nWhat are the key test scenarios we should cover?"
    })
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1000,
        system="You are a senior QA engineer helping plan a test strategy. Ask clarifying questions when requirements are ambiguous.",
        messages=messages
    )
    assistant_response = response.content[0].text
    # Keep the full transcript so the follow-up turn has context.
    messages.append({"role": "assistant", "content": assistant_response})
    # Follow-up: request prioritization
    messages.append({
        "role": "user",
        "content": "Which of these are P0 scenarios that must pass before release? Which can be deferred to regression?"
    })
    # NOTE(review): system= is not re-sent on this second call, so the
    # QA-engineer persona is dropped for the follow-up turn — confirm
    # whether that is intentional.
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1000,
        messages=messages
    )
    final_plan = response.content[0].text
    return final_plan
Streaming for Long Operations
For long report generation, stream the response:
def generate_test_report_streaming(test_results: list[dict]) -> None:
    """Stream a detailed test report to stdout as tokens arrive.

    Args:
        test_results: Result records serialized into the prompt as JSON.
    """
    results_str = json.dumps(test_results, indent=2)
    with client.messages.stream(
        model="claude-opus-4-6",
        max_tokens=4000,
        messages=[{
            "role": "user",
            "content": f"""Generate a detailed test execution report for these results.
Include:
1. Executive summary (pass/fail rates, critical failures)
2. Detailed analysis of each failed test
3. Root cause hypotheses for failure patterns
4. Recommended actions before release
Test results:
{results_str}"""
        }]
    ) as stream:
        # Print incrementally so long reports render progressively.
        for text in stream.text_stream:
            print(text, end="", flush=True)
    print()  # Final newline
Intelligent Bug Report Generation
Automatically generate bug reports from test failures:
def generate_bug_report(
    test_name: str,
    error_message: str,
    screenshot_path: str | None,
    browser_logs: list[str],
) -> dict:
    """Generate a structured bug report from test failure data.

    Args:
        test_name: Name of the failed test.
        error_message: The failure/exception message.
        screenshot_path: Optional path to a PNG of the failure state.
        browser_logs: Console log lines; only the first 20 are sent.

    Returns:
        Bug-report dict (title, severity, description, steps, labels, ...).

    Raises:
        json.JSONDecodeError: If the model reply cannot be parsed as JSON.
    """
    # Pre-join the logs: f-strings could not contain backslashes before
    # Python 3.12, hence the original chr(10) workaround; a local variable
    # produces byte-identical output and is clearer.
    log_excerpt = "\n".join(browser_logs[:20])
    content = [
        {
            "type": "text",
            "text": f"""A Playwright test failed. Generate a structured bug report.
Test: {test_name}
Error: {error_message}
Browser console logs:
{log_excerpt}
Generate JSON:
{{
"title": "concise bug title",
"severity": "critical|high|medium|low",
"description": "what happened",
"steps_to_reproduce": ["step 1", "step 2"],
"expected_behavior": "...",
"actual_behavior": "...",
"possible_causes": ["cause 1", "cause 2"],
"suggested_fix": "...",
"labels": ["bug", "ui", "auth", etc]
}}"""
        }
    ]
    # Add screenshot if available — inserted first so the model sees the
    # image before reading the prompt.
    if screenshot_path:
        image_data = base64.standard_b64encode(Path(screenshot_path).read_bytes()).decode()
        content.insert(0, {
            "type": "image",
            "source": {"type": "base64", "media_type": "image/png", "data": image_data}
        })
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1500,
        messages=[{"role": "user", "content": content}]
    )
    # Strip accidental markdown fences before parsing.
    text = response.content[0].text.strip()
    if text.startswith("```"):
        text = text.strip("`").removeprefix("json").strip()
    return json.loads(text)
# Integrate with Playwright test results
def on_test_failure(test_name: str, error: Exception, page):
    """Capture failure context from the page and produce a bug-report dict."""
    screenshot = f"/tmp/failure-{test_name}.png"
    page.screenshot(path=screenshot)
    # Assumes the app (or test harness) mirrors console output into
    # window.__consoleLogs — TODO confirm that hook is installed.
    logs = page.evaluate("() => window.__consoleLogs || []")
    report = generate_bug_report(
        test_name=test_name,
        error_message=str(error),
        screenshot_path=screenshot,
        browser_logs=logs,
    )
    # Post to GitHub Issues, Jira, Linear, etc.
    print(f"Bug report generated: [{report['severity']}] {report['title']}")
    return report
Rate Limiting and Cost Control
from anthropic import RateLimitError, APIStatusError
import time
def call_with_retry(func, max_retries: int = 3, base_delay: float = 1.0):
    """Call ``func`` with exponential-backoff retries.

    Retries on rate limits and 5xx server errors; other API errors
    (client-side 4xx) are raised immediately.

    Args:
        func: Zero-argument callable performing the API request.
        max_retries: Total number of attempts; must be >= 1.
        base_delay: Initial backoff in seconds; doubles each attempt.

    Returns:
        Whatever ``func`` returns.

    Raises:
        ValueError: If max_retries < 1.
        RateLimitError, APIStatusError: When retries are exhausted or the
            error is not retryable.
    """
    if max_retries < 1:
        # Previously max_retries <= 0 fell through the loop and silently
        # returned None without ever calling func.
        raise ValueError("max_retries must be at least 1")
    for attempt in range(max_retries):
        try:
            return func()
        except RateLimitError:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt)
            print(f"Rate limited. Waiting {delay}s...")
            time.sleep(delay)
        except APIStatusError as e:
            # Retry only server-side (5xx) failures; anything else is a
            # caller problem and is surfaced immediately.
            if e.status_code >= 500 and attempt < max_retries - 1:
                time.sleep(base_delay * (2 ** attempt))
            else:
                raise
# Track token usage for cost monitoring
def track_usage(response) -> dict:
    """Compute token counts and estimated USD cost from a Messages response."""
    usage = response.usage
    # Pricing constants — update when the model or price sheet changes.
    cost_per_mtok_input = 3.00  # claude-opus-4-6 pricing ($/MTok)
    cost_per_mtok_output = 15.00
    input_cost = (usage.input_tokens / 1_000_000) * cost_per_mtok_input
    output_cost = (usage.output_tokens / 1_000_000) * cost_per_mtok_output
    return {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "total_cost_usd": round(input_cost + output_cost, 6),
    }
The Claude API's combination of tool use, vision, and streaming makes it a versatile foundation for intelligent test automation. The key is identifying which steps in your testing workflow have deterministic criteria (traditional assertions) and which benefit from semantic understanding (LLM analysis).