How to process thousands of documents, images, or API calls in parallel without managing workers
The batch processing use case
Hosted LLM APIs process one request at a time unless you parallelize explicitly. Modal's .map() and .starmap() functions fan out across hundreds of parallel containers automatically — turning a 10-hour sequential job into a 10-minute parallel one.
Basic parallel map
import modal

# One App groups all the functions in this tutorial; the image pre-installs
# the OpenAI SDK so every container starts ready to make API calls.
app = modal.App('batch-classifier')
image = modal.Image.debian_slim().pip_install('openai')
@app.function(image=image, timeout=60)
def classify_document(doc: str) -> dict:
    """Classify a single document — Modal will run many of these in parallel.

    Sends the first 500 characters of *doc* to gpt-4o-mini and returns a dict
    with a truncated copy of the document and the model's label.
    """
    from openai import OpenAI

    client = OpenAI()
    prompt = f'Classify this as POSITIVE, NEGATIVE or NEUTRAL: {doc[:500]}'
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}],
        max_tokens=10,  # a single label — no need for a longer completion
    )
    label = completion.choices[0].message.content.strip()
    return {'doc': doc[:50], 'label': label}
@app.local_entrypoint()
def main():
    """Fan the sample documents out over parallel containers and tally labels."""
    documents = [
        'This product is amazing, best purchase ever!',
        'Terrible quality, broke after one day.',
        'It arrived on time and works as described.',
        # ... thousands more documents
    ] * 100  # simulate 300 documents
    # .map() fans out across parallel containers automatically
    results = [r for r in classify_document.map(documents)]
    # booleans sum as 0/1, so this counts the POSITIVE labels
    positive = sum('POSITIVE' in r['label'] for r in results)
    print(f'Processed {len(results)} docs: {positive} positive')
modal run batch_classifier.py
.map() returns a generator — wrap it with list() to wait for all results, or iterate to consume results as they arrive. For very large batches, iterating is more memory-efficient than collecting everything into a list.

Controlling parallelism
# NOTE(review): this call alone does NOT cap concurrency — .map() takes no
# concurrency argument. To limit parallel containers (e.g. to 10, to avoid
# API rate limits) set max_containers=10 (formerly concurrency_limit) on the
# @app.function decorator. The kwargs below only control ordering and errors.
results = list(classify_document.map(
    documents,
    order_outputs=False,     # yield results as they finish, not in input order
    return_exceptions=True,  # don't fail the whole batch on one error
))
# Or use starmap for multiple arguments
@app.function(image=image)
def translate_doc(text: str, target_language: str) -> str:
    """Translate *text* into *target_language* (stub — fill in the LLM call)."""
    pass  # LLM translation logic
# .starmap() unpacks each tuple into positional arguments: one parallel
# translate_doc(text, target_language) call per pair.
translations = list(translate_doc.starmap([
    ('Hello world', 'Spanish'),
    ('Good morning', 'French'),
    ('Thank you', 'German'),
]))
Secrets management
Store API keys as Modal Secrets — they are injected as environment variables at runtime without being in your code.
# Create a secret
modal secret create openai-secret OPENAI_API_KEY=sk-proj-...
import modal
import os
@app.function(
    image=image,
    secrets=[modal.Secret.from_name('openai-secret')],  # inject secret
)
def call_openai(prompt: str) -> str:
    """Send *prompt* to gpt-4o-mini and return the completion text."""
    from openai import OpenAI

    # The secret places OPENAI_API_KEY into the container's environment,
    # so the client needs no explicit api_key argument.
    client = OpenAI()
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}]
    )
    return completion.choices[0].message.content
Processing from a file or database
import modal

# Use a Modal Volume to share input/output files across containers;
# create_if_missing avoids a separate `modal volume create` step.
volume = modal.Volume.from_name('batch-data', create_if_missing=True)
@app.function(
    image=image,
    volumes={'/data': volume},
    secrets=[modal.Secret.from_name('openai-secret')],
)
def process_row(row_id: int) -> dict:
    """Process one record from the shared volume and persist the result.

    Reads /data/input/{row_id}.json, sends its 'text' field to gpt-4o-mini,
    writes the completion to /data/output/{row_id}.json, and returns a small
    status dict so the caller can track progress.
    """
    import json
    import os

    from openai import OpenAI

    # Read input from volume
    with open(f'/data/input/{row_id}.json') as f:
        row = json.load(f)
    # Process
    client = OpenAI()
    result = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': row['text']}]
    ).choices[0].message.content
    # Write output to volume — create the directory first: it won't exist on
    # a fresh volume, and open(..., 'w') does not create parent directories.
    os.makedirs('/data/output', exist_ok=True)
    with open(f'/data/output/{row_id}.json', 'w') as f:
        json.dump({'id': row_id, 'result': result}, f)
    # Explicitly commit so the written file is visible to other containers
    # and to later runs (background commits are best-effort).
    volume.commit()
    return {'id': row_id, 'status': 'done'}
Scheduled batch jobs with Cron
# Run batch processing every night at 2am UTC
@app.function(
    image=image,
    schedule=modal.Cron('0 2 * * *'),
    secrets=[modal.Secret.from_name('openai-secret')],
)
def nightly_classification():
    """Cron-scheduled job: classify everything that arrived since last night."""
    # Pull the pending work set from the database.
    pending_docs = fetch_unprocessed_documents()
    # A single .map() call fans the whole set out across parallel containers.
    results = [label for label in classify_document.map(pending_docs)]
    # Persist before reporting so the batch isn't lost on a late failure.
    save_results_to_db(results)
    print(f'Nightly job: processed {len(results)} documents')
Cost comparison: Modal vs self-managed GPU
| Scenario | Modal | Always-on GPU server |
|---|---|---|
| 1 hour batch job, T4 | ~$0.59 | ~Same (but the server keeps costing money while idle) |
| Daily 30-min job, T4 | ~$9/month | ~$430/month (always-on) |
| Occasional inference | Pay per call | Full server cost regardless |
| Burst to 100 parallel jobs | Scales instantly | Need to pre-provision |
Modal's economics are excellent for batch jobs and bursty workloads. If you need sub-second latency on continuous traffic (thousands of requests per minute), a dedicated GPU server may be cheaper.

Error handling in batch jobs
@app.local_entrypoint()
def main():
    """Run the batch and separate successful results from per-item failures."""
    documents = load_documents()
    results = []
    errors = []
    # return_exceptions=True prevents one failure from stopping everything:
    # failed items come back as Exception instances instead of raising here.
    outcomes = classify_document.map(documents, return_exceptions=True)
    for i, outcome in enumerate(outcomes):
        if not isinstance(outcome, Exception):
            results.append(outcome)
        else:
            errors.append({'index': i, 'error': str(outcome)})
    print(f'Success: {len(results)}, Errors: {len(errors)}')
    if errors:
        print('Failed items:', errors[:5])