← All articles

Using the Claude API for Intelligent Test Automation

January 25, 2026 · 6 min read

ai · python · testing

The Claude API goes beyond simple text generation. Tool use enables structured, multi-step workflows. The vision API understands screenshots. Streaming handles long-running operations. These capabilities combine to create test automation that adapts to application behavior rather than brittle scripted interactions.

Anthropic Python SDK Basics

pip install anthropic
import anthropic
import os

# Read the key explicitly so a missing env var fails fast with a clear KeyError.
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# Send a single-turn message and print the first text block of the reply.
response = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": "What makes a good test case?"}],
)
print(response.content[0].text)

Generating Test Cases with Tool Use

Tool use provides reliable structured output — Claude calls the tool with properly typed arguments:

import anthropic
import json
 
# Client picks up credentials from the environment (no explicit api_key here).
client = anthropic.Anthropic()

# Tool definition used to force structured output: Claude "calls" this tool
# with arguments validated against input_schema, yielding typed JSON instead
# of free-form text that would need fragile parsing.
tools = [
    {
        "name": "create_test_cases",
        "description": "Generate test cases for a feature description",
        "input_schema": {
            "type": "object",
            "required": ["test_cases"],
            "properties": {
                "test_cases": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        # NOTE: test_type is intentionally optional — it is
                        # not listed in "required" below.
                        "required": ["id", "title", "preconditions", "steps", "expected_result", "priority"],
                        "properties": {
                            "id": {"type": "string", "description": "TC-001 format"},
                            "title": {"type": "string"},
                            "preconditions": {"type": "array", "items": {"type": "string"}},
                            "steps": {"type": "array", "items": {"type": "string"}},
                            "expected_result": {"type": "string"},
                            # Closed sets enforced via enum so priorities/types
                            # stay machine-sortable downstream.
                            "priority": {"type": "string", "enum": ["P0", "P1", "P2", "P3"]},
                            "test_type": {"type": "string", "enum": ["functional", "security", "performance", "usability"]}
                        }
                    }
                }
            }
        }
    }
]
 
def generate_test_cases(feature_description: str, existing_cases: int = 0) -> list[dict]:
    """Request structured test cases from Claude via a forced tool call.

    Args:
        feature_description: Plain-text description of the feature to test.
        existing_cases: Count of already-numbered cases, used to continue
            the TC-### sequence.

    Returns:
        The list of test-case dicts from the create_test_cases tool call,
        or an empty list when the response holds no tool_use block.
    """
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=4000,
        tools=tools,
        # Forcing tool_choice guarantees a tool_use block in normal operation.
        tool_choice={"type": "tool", "name": "create_test_cases"},
        messages=[{
            "role": "user",
            "content": f"""Generate comprehensive test cases for this feature.
 
Feature: {feature_description}
 
Requirements:
- Cover happy path, error cases, edge cases, and security scenarios
- Use TC-{existing_cases + 1:03d} numbering
- Be specific about steps and expected results
- Include at least one P0 (critical) test case"""
        }]
    )

    # Pull the first tool_use block, if any; anything else means no cases.
    tool_calls = (block for block in response.content if block.type == "tool_use")
    first = next(tool_calls, None)
    return first.input["test_cases"] if first is not None else []
 
# Usage: describe the feature, then print a one-line summary per case.
feature = """
User authentication with two-factor authentication (2FA).
Users can enable 2FA via TOTP (authenticator apps) or SMS.
After password login, users with 2FA enabled must provide a valid 6-digit code.
Failed 2FA attempts are rate-limited to 5 per hour.
"""

for tc in generate_test_cases(feature):
    print(f"[{tc['priority']}] {tc['id']}: {tc['title']}")

Vision API for UI Validation

Pass screenshots to Claude for intelligent visual analysis:

import anthropic
import base64
from pathlib import Path

# Module-level client shared by the helpers below; the SDK reads
# ANTHROPIC_API_KEY from the environment when no key is passed.
client = anthropic.Anthropic()
 
def analyze_screenshot(screenshot_path: str, context: str) -> dict:
    """Ask Claude to visually inspect a UI screenshot.

    Args:
        screenshot_path: Path to a .png, .jpg/.jpeg, or .webp screenshot.
        context: Description of what the page is expected to show.

    Returns:
        Parsed analysis dict; the prompt requests {"issues": [...],
        "overall_status": "pass|fail|warning"}.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found in the reply.
    """
    import json  # local import keeps this snippet self-contained

    image_data = Path(screenshot_path).read_bytes()
    image_base64 = base64.standard_b64encode(image_data).decode("utf-8")

    # Detect media type from extension; unknown suffixes fall back to PNG.
    # Fix: ".jpeg" was previously missing and would be mislabeled image/png.
    ext = Path(screenshot_path).suffix.lower()
    media_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
    }
    media_type = media_types.get(ext, "image/png")

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1000,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_base64,
                    },
                },
                {
                    "type": "text",
                    "text": f"""Analyze this UI screenshot for the following context:
{context}
 
Identify:
1. Any visual errors, broken layouts, or misaligned elements
2. Missing content that should be visible
3. Accessibility concerns (contrast, text size)
4. Any text that appears truncated or overflowing
 
Return JSON: {{"issues": [{{"type": str, "severity": "high|medium|low", "description": str, "location": str}}], "overall_status": "pass|fail|warning"}}"""
                }
            ],
        }]
    )

    # Fix: models often wrap JSON in ```json fences or add preamble, which made
    # json.loads on the raw text brittle. Parse only the outermost {...} span.
    text = response.content[0].text
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    return json.loads(text)
 
# Use in Playwright tests: capture the page, then let Claude judge the UI.
from playwright.sync_api import sync_playwright

with sync_playwright() as pw:
    browser = pw.chromium.launch()
    page = browser.new_page()
    page.goto("http://localhost:3000/dashboard")
    page.screenshot(path="/tmp/dashboard.png")

    result = analyze_screenshot(
        "/tmp/dashboard.png",
        "Dashboard showing user analytics with charts and a navigation sidebar"
    )

    # Fail the test run with one bullet per reported issue.
    if result["overall_status"] == "fail":
        bullet_lines = []
        for issue in result["issues"]:
            bullet_lines.append(f"- [{issue['severity']}] {issue['description']}")
        raise AssertionError("Visual issues detected:\n" + "\n".join(bullet_lines))

    browser.close()

Multi-Turn Conversations for Test Planning

Use multi-turn conversations to refine test strategies interactively:

def interactive_test_planner(requirements: str) -> str:
    """Develop a test plan through a two-turn conversation with Claude.

    Turn 1 asks for key test scenarios; turn 2 asks the model to split them
    into P0/release-blocking vs. deferrable regression scenarios.

    Args:
        requirements: Free-form feature requirements to plan tests for.

    Returns:
        The model's final prioritized test plan as plain text.
        (Fix: the annotation previously claimed list[str] but a str is returned.)
    """
    # `system` is per-request, not part of the message history, so it must be
    # re-sent on every call — omitting it on turn 2 (as before) silently drops
    # the QA-engineer persona mid-conversation.
    system_prompt = "You are a senior QA engineer helping plan a test strategy. Ask clarifying questions when requirements are ambiguous."

    messages = [{
        "role": "user",
        "content": f"I need to test a new feature. Here are the requirements:\n\n{requirements}\n\nWhat are the key test scenarios we should cover?"
    }]

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1000,
        system=system_prompt,
        messages=messages
    )
    messages.append({"role": "assistant", "content": response.content[0].text})

    # Follow-up: request prioritization
    messages.append({
        "role": "user",
        "content": "Which of these are P0 scenarios that must pass before release? Which can be deferred to regression?"
    })

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1000,
        system=system_prompt,  # fix: was omitted on the second call
        messages=messages
    )
    return response.content[0].text

Streaming for Long Operations

For long report generation, stream the response:

def generate_test_report_streaming(test_results: list[dict]) -> None:
    """Stream a detailed test-execution report from Claude to stdout.

    Args:
        test_results: Result dicts to summarize; serialized as pretty JSON
            into the prompt.
    """
    serialized = json.dumps(test_results, indent=2)
    prompt = f"""Generate a detailed test execution report for these results.
Include:
1. Executive summary (pass/fail rates, critical failures)
2. Detailed analysis of each failed test
3. Root cause hypotheses for failure patterns
4. Recommended actions before release
 
Test results:
{serialized}"""

    # Streaming prints tokens as they arrive instead of waiting for the
    # full (potentially long) report.
    stream_ctx = client.messages.stream(
        model="claude-opus-4-6",
        max_tokens=4000,
        messages=[{"role": "user", "content": prompt}],
    )
    with stream_ctx as stream:
        for chunk in stream.text_stream:
            print(chunk, end="", flush=True)
    print()  # Final newline

Intelligent Bug Report Generation

Automatically generate bug reports from test failures:

def generate_bug_report(
    test_name: str,
    error_message: str,
    screenshot_path: str | None,
    browser_logs: list[str],
) -> dict:
    """Generate a structured bug report from test failure data.

    Args:
        test_name: Name of the failed test.
        error_message: The failure's error/exception message.
        screenshot_path: Optional path to a failure screenshot; attached as
            an image block when given. NOTE(review): the attachment is always
            labeled image/png — confirm callers only pass PNGs.
        browser_logs: Console log lines; only the first 20 are sent.

    Returns:
        Parsed bug-report dict (title, severity, description, ...).

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found in the reply.
    """
    content = [
        {
            "type": "text",
            "text": f"""A Playwright test failed. Generate a structured bug report.
 
Test: {test_name}
Error: {error_message}
Browser console logs:
{chr(10).join(browser_logs[:20])}
 
Generate JSON:
{{
  "title": "concise bug title",
  "severity": "critical|high|medium|low",
  "description": "what happened",
  "steps_to_reproduce": ["step 1", "step 2"],
  "expected_behavior": "...",
  "actual_behavior": "...",
  "possible_causes": ["cause 1", "cause 2"],
  "suggested_fix": "...",
  "labels": ["bug", "ui", "auth", etc]
}}"""
        }
    ]

    # Add screenshot if available (image first so the model sees it before the prompt).
    if screenshot_path:
        image_data = base64.standard_b64encode(Path(screenshot_path).read_bytes()).decode()
        content.insert(0, {
            "type": "image",
            "source": {"type": "base64", "media_type": "image/png", "data": image_data}
        })

    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1500,
        messages=[{"role": "user", "content": content}]
    )

    # Fix: json.loads on the raw reply is brittle — models often wrap JSON in
    # ```json fences or add preamble. Parse only the outermost {...} span.
    text = response.content[0].text
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    return json.loads(text)
 
# Integrate with Playwright test results
def on_test_failure(test_name: str, error: Exception, page):
    """Capture failure artifacts from a Playwright page and file a bug report.

    Args:
        test_name: Name of the failed test (used in the screenshot filename).
        error: The exception that failed the test.
        page: Playwright page object for screenshots and log extraction.

    Returns:
        The structured bug-report dict from generate_bug_report.
    """
    screenshot_path = f"/tmp/failure-{test_name}.png"
    page.screenshot(path=screenshot_path)

    # Assumes the app mirrors console output into window.__consoleLogs.
    console_logs = page.evaluate("() => window.__consoleLogs || []")

    bug = generate_bug_report(
        test_name=test_name,
        error_message=str(error),
        screenshot_path=screenshot_path,
        browser_logs=console_logs,
    )

    # Post to GitHub Issues, Jira, Linear, etc.
    print(f"Bug report generated: [{bug['severity']}] {bug['title']}")
    return bug

Rate Limiting and Cost Control

from anthropic import RateLimitError, APIStatusError
import time
 
def call_with_retry(func, max_retries: int = 3, base_delay: float = 1.0):
    """Call *func* with exponential backoff on transient API failures.

    Retries on RateLimitError and on 5xx APIStatusError responses, waiting
    base_delay * 2**attempt seconds between attempts. Non-5xx status errors
    and all other exceptions propagate immediately.

    Args:
        func: Zero-argument callable to invoke.
        max_retries: Total number of attempts (must be >= 1).
        base_delay: Initial backoff delay in seconds.

    Returns:
        Whatever func() returns on the first successful attempt.

    Raises:
        ValueError: If max_retries < 1.
        RateLimitError, APIStatusError: When all attempts are exhausted.
    """
    # Fix: previously max_retries <= 0 silently returned None without ever
    # calling func; make that misuse loud instead.
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            return func()
        except RateLimitError:
            if last_attempt:
                raise
            delay = base_delay * (2 ** attempt)
            print(f"Rate limited. Waiting {delay}s...")
            time.sleep(delay)
        except APIStatusError as e:
            # Only server-side (5xx) errors are worth retrying.
            if e.status_code >= 500 and not last_attempt:
                time.sleep(base_delay * (2 ** attempt))
            else:
                raise
 
# Track token usage for cost monitoring
def track_usage(response) -> dict:
    usage = response.usage
    cost_per_mtok_input = 3.00   # claude-opus-4-6 pricing ($/MTok)
    cost_per_mtok_output = 15.00
 
    input_cost = (usage.input_tokens / 1_000_000) * cost_per_mtok_input
    output_cost = (usage.output_tokens / 1_000_000) * cost_per_mtok_output
 
    return {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "total_cost_usd": round(input_cost + output_cost, 6),
    }

The Claude API's combination of tool use, vision, and streaming makes it a versatile foundation for intelligent test automation. The key is identifying which steps in your testing workflow have deterministic criteria (traditional assertions) and which benefit from semantic understanding (LLM analysis).