How to write fast, reliable tests for ADK agents using InMemoryRunner and mocks
Why ADK testing is different
Testing AI agents is harder than testing regular code because LLM outputs are non-deterministic and live API calls are slow and expensive. ADK gives you InMemoryRunner as a lightweight test harness and dependency injection patterns that make mocking straightforward.
InMemoryRunner: the test harness
InMemoryRunner runs your agent entirely in memory without any persistence or external infrastructure. It is the right choice for unit tests.
import asyncio
import pytest
from google.adk.agents import LlmAgent
from google.adk.runners import InMemoryRunner
from google.genai import types
# The agent under test
# Agent under test: a minimal single-LLM agent with no tools, so the only
# behavior to verify is the text of its reply.
greeter = LlmAgent(
    name='greeter',
    model='gemini-2.0-flash',  # NOTE(review): pin an exact model version in tests to reduce drift
    instruction='Greet the user warmly and ask how you can help.',
)
async def run_agent(message: str, agent: LlmAgent = greeter) -> str:
    """Send one user message through a fresh InMemoryRunner and return the
    final text response.

    Args:
        message: The user utterance to send.
        agent: The agent to exercise. Defaults to the module-level ``greeter``
            so existing callers are unchanged; pass another agent to reuse
            this helper across tests.

    Returns:
        The concatenated text of all parts of the final-response event, or
        an empty string if the agent produced no final text.
    """
    runner = InMemoryRunner(agent=agent, app_name='test_app')
    session = await runner.session_service.create_session(
        app_name='test_app', user_id='test_user'
    )
    content = types.Content(
        role='user',
        parts=[types.Part(text=message)],
    )
    final_response = ''
    async for event in runner.run_async(
        user_id='test_user',
        session_id=session.id,
        new_message=content,
    ):
        # Only the final-response event carries the answer; intermediate
        # events (tool calls, partial output) are skipped.
        if event.is_final_response() and event.content:
            final_response += ''.join(
                part.text for part in event.content.parts if part.text
            )
    return final_response
@pytest.mark.asyncio
async def test_greeter_responds():
    """The greeter should reply with something recognizably greeting-like."""
    reply = await run_agent('Hello!')
    assert len(reply) > 0
    # Fuzzy assertion: look for any greeting keyword rather than an exact
    # phrase, since LLM wording varies between runs.
    greeting_markers = ('hello', 'hi', 'welcome', 'greet')
    lowered = reply.lower()
    assert any(marker in lowered for marker in greeting_markers)
Test for structure and intent, not exact strings. LLM outputs vary between runs. Assert that the response contains expected information, not that it matches a specific phrase.

Mocking tools for deterministic tests
The main strategy for deterministic testing: replace real tool implementations with mocked versions that return predictable outputs.
from google.adk.tools import FunctionTool
from unittest.mock import patch
# Real tool (makes HTTP call)
async def fetch_weather(city: str) -> dict:
    """Fetches current weather for a city."""
    # ... real implementation
    pass

# NOTE(review): FunctionTool captures a direct reference to fetch_weather
# here, at construction time — so later patching the module attribute
# (e.g. patch('mymodule.fetch_weather')) will not change what this tool
# calls. Mock at the tool level instead.
weather_tool = FunctionTool(fetch_weather)

weather_agent = LlmAgent(
    name='weather_agent',
    model='gemini-2.0-flash',
    instruction='Answer weather questions.',
    tools=[weather_tool],
)
@pytest.mark.asyncio
async def test_weather_agent_uses_tool():
    """The agent should answer from deterministic, mocked weather data.

    Bug fix: the original used patch('mymodule.fetch_weather', ...), but
    FunctionTool captured a reference to the real function when
    weather_tool was constructed, so patching the module attribute never
    reaches the tool. Instead, wire an agent to a mock tool that returns
    canned data.
    """
    mock_weather = {'temperature_c': 20, 'condition': 'Sunny', 'humidity_pct': 55}

    async def mock_fetch_weather(city: str) -> dict:
        """Deterministic stand-in for the real fetch_weather tool."""
        return mock_weather

    mocked_agent = LlmAgent(
        name='weather_agent',
        model='gemini-2.0-flash',
        instruction='Answer weather questions.',
        tools=[FunctionTool(mock_fetch_weather)],
    )
    response = await run_agent_with(mocked_agent, 'What is the weather in Paris?')
    assert '20' in response or 'sunny' in response.lower()
Testing tool call events directly
Instead of asserting on the final text response, you can assert on the tool call events — checking that the LLM invoked the right tool with the right arguments.
@pytest.mark.asyncio
async def test_agent_calls_correct_tool():
    """Verify the LLM selected fetch_weather and passed the city from the prompt."""
    runner = InMemoryRunner(agent=weather_agent, app_name='test_app')
    session = await runner.session_service.create_session(
        app_name='test_app', user_id='u1'
    )
    content = types.Content(role='user', parts=[types.Part(text='Weather in Tokyo?')])

    observed_calls = []
    async for event in runner.run_async(
        user_id='u1', session_id=session.id, new_message=content
    ):
        observed_calls.extend(event.get_function_calls() or [])

    # The right tool must have been invoked at least once...
    weather_invocations = [c for c in observed_calls if c.name == 'fetch_weather']
    assert weather_invocations, \
        'Agent should have called fetch_weather'
    # ...and with the city the user actually asked about.
    assert any('tokyo' in str(c.args).lower() for c in weather_invocations), \
        'Should have called fetch_weather with Tokyo'
Testing multi-agent systems
For multi-agent setups (SequentialAgent, ParallelAgent, or custom orchestrators), test each subagent in isolation first, then test the composite system.
from google.adk.agents import SequentialAgent
# Two-stage pipeline: step1 runs first, then step2, in listed order.
step1 = LlmAgent(name='step1', model='gemini-2.0-flash',
                 instruction='Extract the main topic from the text.')
step2 = LlmAgent(name='step2', model='gemini-2.0-flash',
                 instruction='Write a one-paragraph summary of the topic.')
pipeline = SequentialAgent(name='pipeline', sub_agents=[step1, step2])
@pytest.mark.asyncio
async def test_pipeline_produces_summary():
    """The two-step pipeline should end with a paragraph-length summary."""
    output = await run_agent_with(pipeline, 'Article: AI is transforming healthcare...')
    # Word count is a robust proxy for "a coherent paragraph" — the exact
    # wording varies from run to run.
    word_count = len(output.split())
    assert word_count > 20, 'Summary should be a full paragraph'
Pytest fixtures for reuse
import pytest
from google.adk.runners import InMemoryRunner
@pytest.fixture
def runner():
    """A fresh InMemoryRunner around the module-level weather_agent.

    Bug fix: the original declared ``weather_agent`` as a fixture
    parameter, but weather_agent is a plain module-level object, not a
    fixture — pytest would fail with "fixture 'weather_agent' not found".
    Reference it directly instead; tests requesting ``runner`` are
    unaffected.
    """
    return InMemoryRunner(agent=weather_agent, app_name='test_app')
@pytest.fixture
async def session(runner):
    """A fresh session on the runner's in-memory session service."""
    service = runner.session_service
    return await service.create_session(
        app_name='test_app', user_id='test_user'
    )
@pytest.mark.asyncio
async def test_with_fixtures(runner, session):
    """Smoke test: the agent emits at least one final-response event."""
    message = types.Content(role='user', parts=[types.Part(text='Hi')])
    final_events = [
        event
        async for event in runner.run_async(
            user_id='test_user', session_id=session.id, new_message=message
        )
        if event.is_final_response()
    ]
    assert len(final_events) > 0
CI/CD integration
Add these practices when running ADK tests in CI:
- Set GOOGLE_API_KEY as a CI secret — InMemoryRunner still calls Gemini unless you mock the LLM
- Use pytest-asyncio with asyncio_mode = auto in pytest.ini for cleaner async tests
- Mock expensive tools (web search, code execution) in all CI tests
- Run live integration tests nightly on a separate CI job, not on every PR
- Use pytest -x --tb=short to stop at first failure and keep CI output readable
# pytest.ini
[pytest]
asyncio_mode = auto
markers =
integration: marks tests as integration tests (deselect with -m 'not integration')
# Run only unit tests in CI
pytest -m 'not integration' tests/
# Run integration tests separately
pytest -m integration tests/ --timeout=60
What to test and what not to
| Test this | Do not test this |
|---|---|
| Agent calls the right tool given a specific prompt | Exact wording of LLM responses |
| Tool functions return correct data given inputs | That the LLM chose a specific phrasing |
| Multi-agent routing sends tasks to correct subagent | Internal LLM reasoning steps |
| Agent handles tool errors gracefully | Token count of a response |
| Output contains required information (fuzzy assert) | Exact JSON structure from a freeform LLM call |