Skip to content

Core API Reference

ParallelBatchProcessor

async_batch_llm.ParallelBatchProcessor

ParallelBatchProcessor(max_workers: int | None = None, post_processor: PostProcessorFunc[TOutput, TContext] | None = None, timeout_per_item: float | None = None, rate_limit_cooldown: float | None = None, config: ProcessorConfig | None = None, error_classifier: ErrorClassifier | None = None, rate_limit_strategy: RateLimitStrategy | None = None, middlewares: list[Middleware[TInput, TOutput, TContext]] | None = None, observers: list[ProcessorObserver] | None = None, progress_callback: ProgressCallbackFunc | None = None)

Bases: BatchProcessor[TInput, TOutput, TContext], Generic[TInput, TOutput, TContext]

Batch processor that executes items in parallel as individual agent calls.

This refactored version uses:

- Pluggable error classification (provider-agnostic)
- Pluggable rate limit strategies
- Middleware pipeline for extensibility
- Observer pattern for monitoring
- Configuration objects for easier setup

Initialize the parallel batch processor.

Parameters:

Name Type Description Default
max_workers int | None

Maximum concurrent workers (deprecated, use config)

None
post_processor PostProcessorFunc[TOutput, TContext] | None

Optional async function called after each successful item

None
timeout_per_item float | None

Timeout per item in seconds (deprecated, use config)

None
rate_limit_cooldown float | None

Cooldown duration (deprecated, use config)

None
config ProcessorConfig | None

Processor configuration object (recommended)

None
error_classifier ErrorClassifier | None

Strategy for classifying errors (default: DefaultErrorClassifier)

None
rate_limit_strategy RateLimitStrategy | None

Strategy for handling rate limits

None
middlewares list[Middleware[TInput, TOutput, TContext]] | None

List of middleware to apply

None
observers list[ProcessorObserver] | None

List of observers for events

None
progress_callback ProgressCallbackFunc | None

Optional callback(completed, total, current_item_id) for progress updates

None
Source code in src/async_batch_llm/parallel.py
def __init__(
    self,
    max_workers: int | None = None,
    post_processor: PostProcessorFunc[TOutput, TContext] | None = None,
    timeout_per_item: float | None = None,
    rate_limit_cooldown: float | None = None,  # Deprecated, use config
    # New parameters
    config: ProcessorConfig | None = None,
    error_classifier: ErrorClassifier | None = None,
    rate_limit_strategy: RateLimitStrategy | None = None,
    middlewares: list[Middleware[TInput, TOutput, TContext]] | None = None,
    observers: list[ProcessorObserver] | None = None,
    progress_callback: "ProgressCallbackFunc | None" = None,
):
    """
    Initialize the parallel batch processor.

    Args:
        max_workers: Maximum concurrent workers (deprecated, use config)
        post_processor: Optional async function called after each successful item
        timeout_per_item: Timeout per item in seconds (deprecated, use config)
        rate_limit_cooldown: Cooldown duration (deprecated, use config)
        config: Processor configuration object (recommended)
        error_classifier: Strategy for classifying errors (default: DefaultErrorClassifier)
        rate_limit_strategy: Strategy for handling rate limits
        middlewares: List of middleware to apply
        observers: List of observers for events
        progress_callback: Optional callback(completed, total, current_item_id) for progress updates

    Raises:
        ValueError: If the effective configuration fails ``config.validate()``.

    Note:
        When both ``config`` and a deprecated legacy parameter are supplied,
        the legacy value wins and the caller's config object is mutated in
        place (see the override branch below).
    """
    import warnings

    # Emit deprecation warnings for legacy parameters
    if max_workers is not None:
        warnings.warn(
            "The 'max_workers' parameter is deprecated. "
            "Use ProcessorConfig(max_workers=...) instead.",
            DeprecationWarning,
            stacklevel=2,
        )
    if timeout_per_item is not None:
        warnings.warn(
            "The 'timeout_per_item' parameter is deprecated. "
            "Use ProcessorConfig(timeout_per_item=...) instead.",
            DeprecationWarning,
            stacklevel=2,
        )
    if rate_limit_cooldown is not None:
        warnings.warn(
            "The 'rate_limit_cooldown' parameter is deprecated. "
            "Use ProcessorConfig(rate_limit=RateLimitConfig(cooldown_seconds=...)) instead.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Handle backward compatibility: build a config from legacy parameters
    # (or their documented fallbacks) when no config object was given.
    if config is None:
        from .core import RateLimitConfig

        config = ProcessorConfig(
            max_workers=max_workers or 5,
            timeout_per_item=timeout_per_item or 120.0,
            rate_limit=RateLimitConfig(cooldown_seconds=rate_limit_cooldown or 300.0),
        )
    else:
        # Override config with explicit parameters if provided.
        # NOTE(review): this mutates the caller-supplied config in place —
        # a shared ProcessorConfig instance would be affected for all users.
        if max_workers is not None:
            config.max_workers = max_workers
        if timeout_per_item is not None:
            config.timeout_per_item = timeout_per_item
        if rate_limit_cooldown is not None:
            config.rate_limit.cooldown_seconds = rate_limit_cooldown

    # Re-validate after any legacy overrides; raises ValueError on bad values.
    config.validate()

    super().__init__(
        config.max_workers,
        post_processor,
        max_queue_size=config.max_queue_size,
        progress_callback=progress_callback,
        progress_callback_timeout=config.progress_callback_timeout,
    )
    self.config = config

    # Set up strategies (fall back to defaults when not injected).
    self.error_classifier = error_classifier or DefaultErrorClassifier()
    self.rate_limit_strategy = rate_limit_strategy or ExponentialBackoffStrategy(
        initial_cooldown=config.rate_limit.cooldown_seconds,
        backoff_multiplier=config.rate_limit.backoff_multiplier,
        slow_start_items=config.rate_limit.slow_start_items,
        slow_start_initial_delay=config.rate_limit.slow_start_initial_delay,
        slow_start_final_delay=config.rate_limit.slow_start_final_delay,
    )

    # Set up middleware and observers
    self.middlewares = middlewares or []
    self.observers = observers or []

    # Event + middleware dispatch. Delegates observer emits and the
    # middleware chain (before/after/on_error) to a stateless helper.
    self._events: EventDispatcher[TInput, TOutput, TContext] = EventDispatcher(
        observers=self.observers, middlewares=self.middlewares
    )

    # Rate-limit coordination (extracted in v0.7.0).
    self._rate_limit_coord = RateLimitCoordinator(
        rate_limit_strategy=self.rate_limit_strategy,
        events=self._events,
    )
    # Back-compat aliases — existing private methods and some tests
    # reach into these attributes directly.
    self._rate_limit_event = self._rate_limit_coord._rate_limit_event
    self._current_generation_event = self._rate_limit_coord._current_generation_event
    self._rate_limit_lock = self._rate_limit_coord._lock

    # Thread safety locks (stats + results remain processor-owned).
    self._stats_lock = asyncio.Lock()
    self._results_lock = asyncio.Lock()

    # Strategy lifecycle management (v0.2.0, extracted in v0.7.0).
    # Tracks prepared strategies via a WeakSet so sharing one instance
    # across work items invokes prepare() exactly once.
    self._strategy_lifecycle: StrategyLifecycle[TOutput] = StrategyLifecycle()
    # Back-compat aliases used by existing private methods and tests.
    self._prepared_strategies = self._strategy_lifecycle._prepared
    self._strategy_lock = self._strategy_lifecycle._lock

    # Proactive rate limiting (prevents hitting rate limits)
    if config.max_requests_per_minute:
        from aiolimiter import AsyncLimiter

        # aiolimiter doesn't have explicit burst_size - it uses max_rate as burst capacity
        # To support burst_size, we'd need to use max_rate + burst_size
        # For now, we use max_rate directly (no additional burst)
        self._proactive_rate_limiter: AsyncLimiter | None = AsyncLimiter(
            max_rate=config.max_requests_per_minute,
            time_period=60,  # per minute
        )
    else:
        self._proactive_rate_limiter = None

    # Centralized token-usage extraction across all exception shapes.
    self._token_extractor = TokenExtractor()

__aexit__ async

__aexit__(exc_type, exc_val, exc_tb)

Context manager exit - ensures cleanup of strategies and resources.

Calls cleanup() on all prepared strategies, then delegates to parent cleanup.

Parameters:

Name Type Description Default
exc_type

Exception type (if any exception occurred)

required
exc_val

Exception value (if any exception occurred)

required
exc_tb

Exception traceback (if any exception occurred)

required

Returns:

Type Description

False to indicate exceptions should not be suppressed

Source code in src/async_batch_llm/parallel.py
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """
    Exit the async context manager, releasing strategies and resources.

    Runs cleanup() on every prepared strategy first, then hands off to the
    parent class cleanup (workers and queue).

    Args:
        exc_type: Type of the in-flight exception, if any.
        exc_val: The in-flight exception instance, if any.
        exc_tb: Traceback of the in-flight exception, if any.

    Returns:
        False, so any exception raised inside the context propagates.
    """
    # Strategy teardown must complete before workers/queue are torn down.
    await self._cleanup_strategies()
    # Parent cleanup handles worker tasks and the work queue.
    await self.cleanup()
    # Never suppress the caller's exception.
    return False

get_stats async

get_stats() -> dict

Get processor statistics (thread-safe).

Returns:

Type Description
dict

Dictionary containing processing statistics including:

  • processed: Number of items processed
  • succeeded: Number of successful items
  • failed: Number of failed items
  • rate_limit_count: Number of rate limit errors encountered
  • error_counts: Dictionary of error types and their counts
  • total: Total number of items queued
  • start_time: Timestamp when processing started
Source code in src/async_batch_llm/parallel.py
async def get_stats(self) -> dict:
    """
    Return a snapshot of processor statistics (thread-safe).

    The internal stats dict is copied while holding the stats lock, so the
    returned dictionary is safe to mutate and cannot race with workers.

    Returns:
        Dictionary containing processing statistics including:
        - processed: Number of items processed
        - succeeded: Number of successful items
        - failed: Number of failed items
        - rate_limit_count: Number of rate limit errors encountered
        - error_counts: Dictionary of error types and their counts
        - total: Total number of items queued
        - start_time: Timestamp when processing started
    """
    async with self._stats_lock:
        snapshot = dict(self._stats)
    return snapshot

shutdown async

shutdown()

Clean up resources: flush observers and cancel pending tasks.

Source code in src/async_batch_llm/parallel.py
async def shutdown(self):
    """Clean up resources: flush observers and cancel pending tasks.

    Runs cleanup() on all prepared strategies first, then delegates to the
    parent cleanup (workers and queue). Mirrors __aexit__ for callers that
    manage the processor's lifetime manually rather than via ``async with``.
    """
    await self._cleanup_strategies()
    await self.cleanup()

LLMWorkItem

async_batch_llm.LLMWorkItem dataclass

LLMWorkItem(item_id: str, strategy: LLMCallStrategy[TOutput], prompt: str = '', context: TContext | None = None)

Bases: Generic[TInput, TOutput, TContext]

Represents a single work item to be processed by an LLM strategy.

Attributes:

Name Type Description
item_id str

Unique identifier for this work item

strategy LLMCallStrategy[TOutput]

LLM call strategy that encapsulates how to make the LLM call

prompt str

The prompt/input to pass to the LLM

context TContext | None

Optional context data passed through to results/post-processor

__post_init__

__post_init__()

Validate work item fields.

Source code in src/async_batch_llm/base.py
def __post_init__(self):
    """Validate work item fields after dataclass construction.

    Raises:
        ValueError: If item_id is not a non-empty, non-whitespace string,
            or if strategy is None.
        TypeError: If prompt is not a string.
    """
    # item_id must be a string and must carry visible content.
    if not isinstance(self.item_id, str) or not self.item_id:
        raise ValueError(
            f"item_id must be a non-empty string (got {type(self.item_id).__name__}: {repr(self.item_id)}). "
            f"Provide a unique string identifier for this work item."
        )
    if self.item_id.strip() == "":
        raise ValueError(
            f"item_id cannot be whitespace only (got {repr(self.item_id)}). "
            f"Provide a non-whitespace string identifier."
        )
    # A strategy is mandatory — it encapsulates how the LLM call is made.
    if self.strategy is None:
        raise ValueError(
            "strategy must not be None. "
            "Pass an LLMCallStrategy instance (e.g., PydanticAIStrategy, GeminiStrategy, "
            "or your custom subclass)."
        )
    if not isinstance(self.prompt, str):
        raise TypeError(
            f"prompt must be a string (got {type(self.prompt).__name__}: {repr(self.prompt)[:80]}). "
            f"If you need to pass structured data, serialize it to a string first."
        )

WorkItemResult

async_batch_llm.WorkItemResult dataclass

WorkItemResult(item_id: str, success: bool, output: TOutput | None = None, error: str | None = None, context: TContext | None = None, token_usage: TokenUsage = (lambda: {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0})(), gemini_safety_ratings: dict[str, str] | None = None)

Bases: Generic[TOutput, TContext]

Result of processing a single work item.

Attributes:

Name Type Description
item_id str

ID of the work item

success bool

Whether processing succeeded

output TOutput | None

Agent output if successful, None if failed

error str | None

Error message if failed, None if successful

context TContext | None

Context data from the work item

token_usage TokenUsage

Token usage stats (input_tokens, output_tokens, total_tokens)

gemini_safety_ratings dict[str, str] | None

Gemini API safety ratings if available

ProcessorConfig

async_batch_llm.ProcessorConfig dataclass

ProcessorConfig(max_workers: int = 5, timeout_per_item: float = 120.0, post_processor_timeout: float = 90.0, retry: RetryConfig = RetryConfig(), rate_limit: RateLimitConfig = RateLimitConfig(), max_requests_per_minute: float | None = None, progress_interval: int = 10, progress_callback_timeout: float | None = 5.0, enable_detailed_logging: bool = False, max_queue_size: int = 0, dry_run: bool = False)

Complete configuration for batch processor.

__post_init__

__post_init__() -> None

Validate configuration on construction.

Source code in src/async_batch_llm/core/config.py
def __post_init__(self) -> None:
    """Validate configuration on construction.

    Delegates to validate() so a freshly built ProcessorConfig is always
    consistent; raises ValueError on invalid values.
    """
    self.validate()

validate

validate() -> None

Validate complete configuration.

Source code in src/async_batch_llm/core/config.py
def validate(self) -> None:
    """Validate complete configuration.

    Checks scalar fields first (raising ValueError for hard errors), then
    the nested retry/rate-limit configs, then logs warnings for cross-field
    combinations that are legal but likely misconfigured. Check order
    matters: the first failing field determines which error surfaces.

    Raises:
        ValueError: If any field is outside its allowed range (including
            failures raised by the nested retry/rate_limit validators).
    """
    if self.max_workers < 1:
        raise ValueError(
            f"max_workers must be >= 1 (got {self.max_workers}). "
            f"Set config.max_workers to a positive integer (typical: 5-20)."
        )
    if self.timeout_per_item <= 0:
        raise ValueError(
            f"timeout_per_item must be > 0 (got {self.timeout_per_item}). "
            f"Set config.timeout_per_item to a positive number in seconds (typical: 60-300)."
        )
    if self.post_processor_timeout <= 0:
        raise ValueError(
            f"post_processor_timeout must be > 0 (got {self.post_processor_timeout}). "
            f"Set config.post_processor_timeout to a positive number in seconds (typical: 30-120)."
        )
    if self.progress_interval < 1:
        raise ValueError(
            f"progress_interval must be >= 1 (got {self.progress_interval}). "
            f"Set config.progress_interval to a positive integer."
        )
    # None disables the callback timeout entirely, so only reject <= 0.
    if self.progress_callback_timeout is not None and self.progress_callback_timeout <= 0:
        raise ValueError(
            f"progress_callback_timeout must be > 0 (got {self.progress_callback_timeout}). "
            f"Set config.progress_callback_timeout to None to disable or a positive number of seconds."
        )
    # 0 means "unlimited queue", so it is a valid value here.
    if self.max_queue_size < 0:
        raise ValueError(
            f"max_queue_size must be >= 0 (got {self.max_queue_size}). "
            f"Set config.max_queue_size to 0 for unlimited, or a positive number to limit queue size."
        )
    if self.max_requests_per_minute is not None and self.max_requests_per_minute <= 0:
        raise ValueError(
            f"max_requests_per_minute must be > 0 or None (got {self.max_requests_per_minute}). "
            f"Set config.max_requests_per_minute to None to disable proactive rate limiting, "
            f"or a positive number (typical: 10-500 requests/minute)."
        )

    # Validate nested configs first
    self.retry.validate()
    self.rate_limit.validate()

    # Cross-field validations
    if self.max_queue_size > 0 and self.max_queue_size < self.max_workers:
        logger.warning(
            f"max_queue_size ({self.max_queue_size}) is less than max_workers ({self.max_workers}). "
            f"This may cause workers to starve waiting for work. "
            f"Consider setting max_queue_size >= max_workers or 0 for unlimited."
        )

    if self.timeout_per_item < self.retry.initial_wait:
        logger.warning(
            f"timeout_per_item ({self.timeout_per_item}s) is less than "
            f"retry.initial_wait ({self.retry.initial_wait}s). "
            f"This means the timeout may occur before the first retry delay completes. "
            f"Consider increasing timeout_per_item or decreasing retry.initial_wait."
        )

    # Calculate maximum possible retry wait time (sum of capped
    # exponential-backoff delays across all retry attempts).
    max_total_retry_wait = 0.0
    for attempt in range(self.retry.max_attempts - 1):  # -1 because first attempt has no wait
        wait_time = min(
            self.retry.initial_wait * (self.retry.exponential_base**attempt),
            self.retry.max_wait,
        )
        max_total_retry_wait += wait_time

    # Warn when the per-item timeout covers less than half of the worst-case
    # cumulative retry delay — retries would routinely be cut short.
    if max_total_retry_wait > 0 and self.timeout_per_item < max_total_retry_wait * 0.5:
        jitter_note = (
            " (with jitter, actual delays will be 50-100% of this)" if self.retry.jitter else ""
        )
        logger.warning(
            f"timeout_per_item ({self.timeout_per_item}s) may be too short for retry strategy. "
            f"With {self.retry.max_attempts} attempts, retry delays could total up to "
            f"{max_total_retry_wait:.1f}s{jitter_note}. "
            f"Consider increasing timeout_per_item to at least {max_total_retry_wait * 2:.1f}s."
        )

    # Validate proactive rate limit vs workers
    if self.max_requests_per_minute is not None:
        requests_per_second = self.max_requests_per_minute / 60.0
        if requests_per_second < self.max_workers:
            logger.warning(
                f"max_requests_per_minute ({self.max_requests_per_minute}) is less than "
                f"max_workers ({self.max_workers}). "
                f"At {requests_per_second:.2f} requests/second with {self.max_workers} workers, "
                f"workers may frequently wait for rate limit tokens. "
                f"Consider reducing max_workers to {int(requests_per_second)} or increasing "
                f"max_requests_per_minute."
            )

BatchResult

async_batch_llm.BatchResult dataclass

BatchResult(results: list[WorkItemResult[TOutput, TContext]], total_items: int = 0, succeeded: int = 0, failed: int = 0, total_input_tokens: int = 0, total_output_tokens: int = 0, total_cached_tokens: int = 0)

Bases: Generic[TOutput, TContext]

Result of processing a batch of work items.

Attributes:

Name Type Description
results list[WorkItemResult[TOutput, TContext]]

List of individual work item results

total_items int

Total number of items in the batch

succeeded int

Number of successful items

failed int

Number of failed items

total_input_tokens int

Sum of input tokens across all items

total_output_tokens int

Sum of output tokens across all items

total_cached_tokens int

Sum of cached input tokens across all items (v0.2.0)

__post_init__

__post_init__()

Calculate summary statistics from results.

Source code in src/async_batch_llm/base.py
def __post_init__(self):
    """Derive aggregate statistics from the per-item results list.

    Recomputes total_items, succeeded/failed counts, and token totals
    (including cached input tokens, v0.2.0) from ``self.results``,
    overwriting whatever values were passed to the constructor.
    """
    count = len(self.results)
    wins = 0
    input_total = 0
    output_total = 0
    cached_total = 0
    # Single pass over results: count successes and sum token usage.
    for result in self.results:
        if result.success:
            wins += 1
        usage = result.token_usage
        input_total += usage.get("input_tokens", 0)
        output_total += usage.get("output_tokens", 0)
        cached_total += usage.get("cached_input_tokens", 0)
    self.total_items = count
    self.succeeded = wins
    self.failed = count - wins
    self.total_input_tokens = input_total
    self.total_output_tokens = output_total
    self.total_cached_tokens = cached_total

cache_hit_rate

cache_hit_rate() -> float

Calculate cache hit rate as percentage of input tokens that were cached.

Returns:

Type Description
float

Percentage (0.0 to 100.0) of input tokens served from cache

Source code in src/async_batch_llm/base.py
def cache_hit_rate(self) -> float:
    """
    Percentage of input tokens that were served from the provider cache.

    Returns:
        A value from 0.0 to 100.0; 0.0 when no input tokens were recorded
        (avoids division by zero).
    """
    total = self.total_input_tokens
    if not total:
        return 0.0
    return (self.total_cached_tokens / total) * 100.0

effective_input_tokens

effective_input_tokens() -> int

Calculate effective input tokens (actual cost after caching).

Gemini charges 10% of the normal price for cached tokens.

Returns:

Type Description
int

Effective number of input tokens billed

Source code in src/async_batch_llm/base.py
def effective_input_tokens(self) -> int:
    """
    Effective input tokens billed after applying the cache discount.

    Gemini charges 10% of the normal price for cached tokens, so each
    cached token removes 90% of its cost from the billed total.

    Returns:
        Number of input tokens effectively billed.
    """
    # 90% discount on cached tokens, truncated to whole tokens.
    savings = int(self.total_cached_tokens * 0.9)
    return self.total_input_tokens - savings