How to process thousands of documents, images, or API calls in parallel without managing workers
The batch processing use case
Hosted LLM APIs process one request at a time unless you parallelize explicitly. Modal's .map() and .starmap() functions fan out across hundreds of parallel containers automatically — turning a 10-hour sequential job into a 10-minute parallel one.
Basic parallel map
import modal

# One App groups all the functions in this tutorial; the image pre-installs
# the OpenAI SDK so every container starts ready to make API calls.
app = modal.App('batch-classifier')
image = modal.Image.debian_slim().pip_install('openai')
@app.function(image=image, timeout=60)
def classify_document(doc: str) -> dict:
    """Classify a single document — Modal will run many of these in parallel.

    Sends the first 500 characters of *doc* to gpt-4o-mini and returns a dict
    with a truncated copy of the document and the model's label.
    """
    from openai import OpenAI

    client = OpenAI()
    prompt = f'Classify this as POSITIVE, NEGATIVE or NEUTRAL: {doc[:500]}'
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}],
        max_tokens=10,  # a single label — no need for a longer completion
    )
    label = completion.choices[0].message.content.strip()
    return {'doc': doc[:50], 'label': label}
@app.local_entrypoint()
def main():
    """Fan the sample documents out over parallel containers and tally labels."""
    documents = [
        'This product is amazing, best purchase ever!',
        'Terrible quality, broke after one day.',
        'It arrived on time and works as described.',
        # ... thousands more documents
    ] * 100  # simulate 300 documents
    # .map() fans out across parallel containers automatically
    results = [r for r in classify_document.map(documents)]
    # booleans sum as 0/1, so this counts the POSITIVE labels
    positive = sum('POSITIVE' in r['label'] for r in results)
    print(f'Processed {len(results)} docs: {positive} positive')
modal run batch_classifier.py
.map() returns a generator — wrap it with list() to wait for all results, or iterate to consume results as they arrive. For very large batches, iterating is more memory-efficient than collecting everything into a list.

Controlling parallelism
# NOTE(review): this call alone does NOT cap concurrency — .map() takes no
# concurrency argument. To limit parallel containers (e.g. to 10, to avoid
# API rate limits) set max_containers=10 (formerly concurrency_limit) on the
# @app.function decorator. The kwargs below only control ordering and errors.
results = list(classify_document.map(
    documents,
    order_outputs=False,     # yield results as they finish, not in input order
    return_exceptions=True,  # don't fail the whole batch on one error
))
# Or use starmap for multiple arguments
@app.function(image=image)
def translate_doc(text: str, target_language: str) -> str:
    """Translate *text* into *target_language* (stub — fill in the LLM call)."""
    pass  # LLM translation logic
# .starmap() unpacks each tuple into positional arguments: one parallel
# translate_doc(text, target_language) call per pair.
translations = list(translate_doc.starmap([
    ('Hello world', 'Spanish'),
    ('Good morning', 'French'),
    ('Thank you', 'German'),
]))
Secrets management
Store API keys as Modal Secrets — they are injected as environment variables at runtime without being in your code.
# Create a secret
modal secret create openai-secret OPENAI_API_KEY=sk-proj-...
import modal
import os
@app.function(
    image=image,
    secrets=[modal.Secret.from_name('openai-secret')],  # inject secret
)
def call_openai(prompt: str) -> str:
    """Send *prompt* to gpt-4o-mini and return the completion text."""
    from openai import OpenAI

    # The secret places OPENAI_API_KEY into the container's environment,
    # so the client needs no explicit api_key argument.
    client = OpenAI()
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}]
    )
    return completion.choices[0].message.content
Processing from a file or database
import modal

# Use a Modal Volume to share input/output files across containers;
# create_if_missing avoids a separate `modal volume create` step.
volume = modal.Volume.from_name('batch-data', create_if_missing=True)
@app.function(
    image=image,
    volumes={'/data': volume},
    secrets=[modal.Secret.from_name('openai-secret')],
)
def process_row(row_id: int) -> dict:
    """Process one record from the shared volume and persist the result.

    Reads /data/input/{row_id}.json, sends its 'text' field to gpt-4o-mini,
    writes the completion to /data/output/{row_id}.json, and returns a small
    status dict so the caller can track progress.
    """
    import json
    import os

    from openai import OpenAI

    # Read input from volume
    with open(f'/data/input/{row_id}.json') as f:
        row = json.load(f)
    # Process
    client = OpenAI()
    result = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': row['text']}]
    ).choices[0].message.content
    # Write output to volume — create the directory first: it won't exist on
    # a fresh volume, and open(..., 'w') does not create parent directories.
    os.makedirs('/data/output', exist_ok=True)
    with open(f'/data/output/{row_id}.json', 'w') as f:
        json.dump({'id': row_id, 'result': result}, f)
    # Explicitly commit so the written file is visible to other containers
    # and to later runs (background commits are best-effort).
    volume.commit()
    return {'id': row_id, 'status': 'done'}
Scheduled batch jobs with Cron
# Run batch processing every night at 2am UTC
@app.function(
    image=image,
    schedule=modal.Cron('0 2 * * *'),
    secrets=[modal.Secret.from_name('openai-secret')],
)
def nightly_classification():
    """Cron-scheduled job: classify everything that arrived since last night."""
    # Pull the pending work set from the database.
    pending_docs = fetch_unprocessed_documents()
    # A single .map() call fans the whole set out across parallel containers.
    results = [label for label in classify_document.map(pending_docs)]
    # Persist before reporting so the batch isn't lost on a late failure.
    save_results_to_db(results)
    print(f'Nightly job: processed {len(results)} documents')
Cost comparison: Modal vs self-managed GPU
| Scenario | Modal | Always-on GPU server |
|---|---|---|
| 1 hour batch job, T4 | ~$0.59 | ~Same (but the server keeps costing money while idle) |
| Daily 30-min job, T4 | ~$9/month | ~$430/month (always-on) |
| Occasional inference | Pay per call | Full server cost regardless |
| Burst to 100 parallel jobs | Scales instantly | Need to pre-provision |
Modal's economics are excellent for batch jobs and bursty workloads. If you need sub-second latency on continuous traffic (thousands of requests per minute), a dedicated GPU server may be cheaper.

Error handling in batch jobs
@app.local_entrypoint()
def main():
    """Run the batch and separate successful results from per-item failures."""
    documents = load_documents()
    results = []
    errors = []
    # return_exceptions=True prevents one failure from stopping everything:
    # failed items come back as Exception instances instead of raising here.
    outcomes = classify_document.map(documents, return_exceptions=True)
    for i, outcome in enumerate(outcomes):
        if not isinstance(outcome, Exception):
            results.append(outcome)
        else:
            errors.append({'index': i, 'error': str(outcome)})
    print(f'Success: {len(results)}, Errors: {len(errors)}')
    if errors:
        print('Failed items:', errors[:5])