How to process thousands of documents, images, or API calls in parallel without managing workers

The batch processing use case

Calling a hosted LLM API from a plain sequential loop issues one request at a time, even though the API can serve many requests concurrently. Modal's .map() and .starmap() functions fan out across hundreds of parallel containers automatically — turning a 10-hour sequential job into a 10-minute parallel one.

Basic parallel map

import modal

# One Modal App groups every function in this file under a single deployable name.
app = modal.App('batch-classifier')

# Container image each function runs in: slim Debian base plus the OpenAI SDK.
image = modal.Image.debian_slim().pip_install('openai')
 
@app.function(
    image=image,
    timeout=60,
    # Without this the container has no OPENAI_API_KEY and OpenAI() raises at
    # runtime — matches the Secrets section below.
    secrets=[modal.Secret.from_name('openai-secret')],
    retries=2,  # transient failures (rate limits, network blips) are retried per-input
)
def classify_document(doc: str) -> dict:
    """Classify a single document — Modal will run many of these in parallel.

    Args:
        doc: Raw document text; only the first 500 characters are sent to the model.

    Returns:
        dict with 'doc' (first 50 chars as a preview) and 'label' (the model's
        reply, expected to be POSITIVE, NEGATIVE or NEUTRAL — not validated here).
    """
    from openai import OpenAI
    client = OpenAI()  # picks up OPENAI_API_KEY from the injected environment
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{
            'role': 'user',
            'content': f'Classify this as POSITIVE, NEGATIVE or NEUTRAL: {doc[:500]}'
        }],
        max_tokens=10,  # the label is a single word; cap output cost
    )
    return {'doc': doc[:50], 'label': response.choices[0].message.content.strip()}
 
@app.local_entrypoint()
def main():
    """Fan the documents out with .map(), then summarize the labels."""
    documents = [
        'This product is amazing, best purchase ever!',
        'Terrible quality, broke after one day.',
        'It arrived on time and works as described.',
        # ... thousands more documents
    ] * 100  # simulate 300 documents

    # .map() fans out across parallel containers automatically;
    # streaming the generator tallies results as they arrive.
    total = 0
    positive = 0
    for outcome in classify_document.map(documents):
        total += 1
        if 'POSITIVE' in outcome['label']:
            positive += 1

    print(f'Processed {total} docs: {positive} positive')
 
modal run batch_classifier.py
 
.map() returns a generator — wrap with list() to wait for all results, or iterate as results arrive. For very large batches, iterating is more memory-efficient than collecting into a list.

Controlling parallelism

# Concurrency is capped on the function decorator, not on .map() —
#   @app.function(image=image, max_containers=10)
# keeps at most 10 containers live, avoiding upstream rate limit issues.
# (Older Modal SDKs spell this concurrency_limit=10.)
results = list(classify_document.map(
    documents,
    order_outputs=False,     # don't wait for ordering — faster
    return_exceptions=True,  # don't fail the whole batch on one error
))
 
# Or use starmap for multiple arguments
@app.function(image=image)
def translate_doc(text: str, target_language: str) -> str:
    """Translate *text* into *target_language* (placeholder body)."""
    pass  # LLM translation logic

# .starmap() unpacks each tuple into the function's positional arguments
translations = list(translate_doc.starmap([
    ('Hello world', 'Spanish'),
    ('Good morning', 'French'),
    ('Thank you', 'German'),
]))
 

Secrets management

Store API keys as Modal Secrets — they are injected as environment variables at runtime without being in your code.

# Create a secret
modal secret create openai-secret OPENAI_API_KEY=sk-proj-...
 
import modal
import os
 
@app.function(
    image=image,
    secrets=[modal.Secret.from_name('openai-secret')],  # inject secret
)
def call_openai(prompt: str) -> str:
    """Send *prompt* to gpt-4o-mini and return the raw completion text."""
    from openai import OpenAI

    # OPENAI_API_KEY is available via os.environ automatically,
    # so the client needs no explicit key argument.
    client = OpenAI()
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}]
    )
    return response.choices[0].message.content
 

Processing from a file or database

import modal

# Use a Modal Volume to share input/output files across containers.
# create_if_missing=True provisions the volume on first use.
volume = modal.Volume.from_name('batch-data', create_if_missing=True)
 
@app.function(
    image=image,
    volumes={'/data': volume},
    secrets=[modal.Secret.from_name('openai-secret')],
)
def process_row(row_id: int) -> dict:
    """Read one input record from the shared volume, run it through the
    model, and write the result back to the volume.

    Args:
        row_id: Numeric id naming both the input and the output JSON file.

    Returns:
        dict with 'id' and 'status' so the caller can track completion.
    """
    import json
    import os

    # Pick up files other containers committed after this one started.
    volume.reload()

    # Read input from volume
    with open(f'/data/input/{row_id}.json') as f:
        row = json.load(f)

    # Process
    from openai import OpenAI
    client = OpenAI()
    result = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': row['text']}]
    ).choices[0].message.content

    # Write output to volume — the directory may not exist on the first run,
    # and open(..., 'w') does not create parent directories.
    os.makedirs('/data/output', exist_ok=True)
    with open(f'/data/output/{row_id}.json', 'w') as f:
        json.dump({'id': row_id, 'result': result}, f)

    # Persist the write so other containers and later runs can see it.
    volume.commit()

    return {'id': row_id, 'status': 'done'}
 

Scheduled batch jobs with Cron

# Run batch processing every night at 2am UTC
@app.function(
    image=image,
    schedule=modal.Cron('0 2 * * *'),
    secrets=[modal.Secret.from_name('openai-secret')],
)
def nightly_classification():
    """Scheduled job: classify whatever accumulated since the last run."""
    # Fetch new documents from database, fan them out in parallel,
    # then persist the labels.
    pending = fetch_unprocessed_documents()
    classified = list(classify_document.map(pending))
    save_results_to_db(classified)
    print(f'Nightly job: processed {len(classified)} documents')
 

Cost comparison: Modal vs self-managed GPU

Scenario Modal Always-on GPU server
1 hour batch job, T4 ~$0.59 Similar (but the server keeps billing while idle)
Daily 30-min job, T4 ~$9/month ~$430/month (always-on)
Occasional inference Pay per call Full server cost regardless
Burst to 100 parallel jobs Scales instantly Need to pre-provision
Modal's economics are excellent for batch jobs and bursty workloads. If you need sub-second latency on continuous traffic (thousands of requests per minute), a dedicated GPU server may be cheaper.

Error handling in batch jobs

@app.local_entrypoint()
def main():
    """Run the batch and partition outcomes into successes and failures."""
    documents = load_documents()

    successes = []
    failures = []

    # return_exceptions=True prevents one failure from stopping everything;
    # exceptions come back in-stream as values instead of being raised.
    outcomes = classify_document.map(documents, return_exceptions=True)
    for idx, outcome in enumerate(outcomes):
        if isinstance(outcome, Exception):
            failures.append({'index': idx, 'error': str(outcome)})
        else:
            successes.append(outcome)

    print(f'Success: {len(successes)}, Errors: {len(failures)}')
    if failures:
        print('Failed items:', failures[:5])