
Cost Tracking

run() returns a plain string. run_with_response() returns an AgentResponse that includes the text reply, every tool call made, and token/cost metadata from the LiteLLM response.

from cyclops import Agent, AgentConfig

config = AgentConfig(model="gpt-4o-mini")
agent = Agent(config)

result = agent.run_with_response("Summarise the theory of relativity in two sentences.")
print(result.content)
print(f"Model      : {result.model}")
print(f"Prompt     : {result.prompt_tokens} tokens")
print(f"Completion : {result.completion_tokens} tokens")
print(f"Total      : {result.tokens_used} tokens")
print(f"Cost       : ${result.cost:.6f}")

The async version works identically:

import asyncio

async def main():
    result = await agent.arun_with_response("Explain neural networks briefly.")
    print(result.content)
    print(f"Cost: ${result.cost:.6f}")

asyncio.run(main())
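Because arun_with_response() is awaitable, several independent prompts can run concurrently and their costs summed afterwards. A minimal sketch, not part of the cyclops API: the ask() helper is illustrative, and it creates a fresh Agent per prompt so that concurrent calls never share one agent's accumulating conversation history.

import asyncio

from cyclops import Agent, AgentConfig

async def ask(question: str) -> float:
    # A fresh Agent per prompt keeps each conversation history separate.
    agent = Agent(AgentConfig(model="gpt-4o-mini"))
    response = await agent.arun_with_response(question)
    print(f"{question} -> {response.tokens_used} tokens")
    return response.cost or 0.0  # cost can be None for local models

async def main() -> None:
    costs = await asyncio.gather(
        ask("Define entropy in one sentence."),
        ask("Define enthalpy in one sentence."),
    )
    print(f"Combined cost: ${sum(costs):.6f}")

asyncio.run(main())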
AgentResponse exposes the following fields:

| Field | Type | Description |
| --- | --- | --- |
| content | str | The final text response from the agent. |
| tool_calls | List[ToolCall] | Every tool call made during the run, in order. |
| model | str | The model identifier used (from AgentConfig.model). |
| tokens_used | int or None | Total tokens consumed (prompt + completion). None if unavailable. |
| prompt_tokens | int or None | Tokens in the prompt/context sent to the model. |
| completion_tokens | int or None | Tokens in the generated response. |
| cost | float or None | Estimated USD cost from LiteLLM's pricing table. None for self-hosted models. |
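Since the full example below notes that AgentResponse is a Pydantic model, all of these fields can be dumped in one call, which is convenient for structured logging. A sketch, assuming Pydantic v2 (on v1, .dict() plays the same role):

import json

result = agent.run_with_response("What is a monad?")
# model_dump() returns a plain dict of the fields above;
# default=str covers any non-JSON-native tool results.
print(json.dumps(result.model_dump(), indent=2, default=str))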

Each ToolCall in tool_calls has:

| Field | Type | Description |
| --- | --- | --- |
| id | str | Unique call identifier assigned by the LLM. |
| name | str | Name of the tool that was called. |
| arguments | dict | Arguments the model passed to the tool. |
| result | Any or None | The value returned by the tool. |
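These four fields are enough to reconstruct an audit trail of what the model did during a run. A small illustrative sketch (the log format is an assumption, not part of cyclops):

def audit_line(tc) -> str:
    # arguments is a dict and result can be any type, so repr() keeps
    # the line readable without assuming JSON-serializable values.
    return f"[{tc.id}] {tc.name} args={tc.arguments!r} -> {tc.result!r}"

for tc in result.tool_calls:
    print(audit_line(tc))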

Track totals across many runs with a simple dataclass:

from dataclasses import dataclass, field
from typing import List

from cyclops import Agent, AgentConfig
from cyclops.core.types import AgentResponse

@dataclass
class UsageTracker:
    total_cost: float = 0.0
    total_tokens: int = 0
    runs: List[AgentResponse] = field(default_factory=list)

    def record(self, response: AgentResponse) -> None:
        self.runs.append(response)
        self.total_cost += response.cost or 0.0
        self.total_tokens += response.tokens_used or 0

    def report(self) -> None:
        print(f"Runs         : {len(self.runs)}")
        print(f"Total cost   : ${self.total_cost:.4f}")
        print(f"Total tokens : {self.total_tokens:,}")

tracker = UsageTracker()
config = AgentConfig(model="gpt-4o-mini")
agent = Agent(config)

questions = [
    "What is photosynthesis?",
    "Explain DNS in one sentence.",
    "What is the boiling point of water in Celsius?",
]

for q in questions:
    result = agent.run_with_response(q)
    tracker.record(result)
    print(f"Q: {q}")
    print(f"A: {result.content}")
    print(f"   cost={result.cost:.6f} tokens={result.tokens_used}\n")

tracker.report()
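If you want a hard ceiling rather than an after-the-fact report, the tracker can refuse to record past a budget. A minimal sketch extending the class above; BudgetExceededError and max_cost are illustrative names, not part of cyclops:

class BudgetExceededError(RuntimeError):
    """Raised when the session cost ceiling is hit."""

@dataclass
class BudgetedTracker(UsageTracker):
    max_cost: float = 1.00  # USD ceiling for the session

    def record(self, response: AgentResponse) -> None:
        super().record(response)
        if self.total_cost > self.max_cost:
            raise BudgetExceededError(
                f"Spent ${self.total_cost:.4f} of ${self.max_cost:.2f} budget"
            )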

When tools are used, each invocation is recorded in result.tool_calls:

from cyclops import Agent, AgentConfig
from cyclops.toolkit import tool

@tool
def lookup(symbol: str) -> str:
    """Look up a stock price."""
    return "189.50"

config = AgentConfig(model="gpt-4o-mini")
agent = Agent(config, tools=[lookup])

result = agent.run_with_response("What is the current price of AAPL?")
print(result.content)
for tc in result.tool_calls:
    print(f"  Called: {tc.name}({tc.arguments}) -> {tc.result}")
The complete runnable script lives at examples/cost_tracking.py:
"""Cost and token tracking example: run_with_response() / arun_with_response()
Demonstrates:
1. Inspecting AgentResponse metadata (tokens_used, cost, prompt_tokens,
completion_tokens) from a single synchronous run
2. Async variant with arun_with_response()
3. Accumulating cost and tokens across multiple calls to track a session budget
Cost data is powered by LiteLLM's built-in pricing table. The numbers are
accurate for cloud APIs (OpenAI, Anthropic, etc.) but will be None for
local/free models like Ollama since they have no billing cost.
Default model is gpt-4o-mini so the cost fields are populated with real values.
Swap MODEL below if you only have a local Ollama instance available.
"""
import asyncio

from cyclops import Agent, AgentConfig, AgentResponse

# ---------------------------------------------------------------------------
# Model configuration
# ---------------------------------------------------------------------------
# gpt-4o-mini gives real cost data (requires OPENAI_API_KEY)
MODEL = "gpt-4o-mini"

# For a free/local alternative, swap to:
#   MODEL = "ollama/qwen3:4b"
# Note: cost will be None for Ollama; token counts still work.
#
# Other cloud alternatives with cost data:
#   MODEL = "claude-3-haiku-20240307"      # ANTHROPIC_API_KEY
#   MODEL = "groq/llama-3.1-8b-instant"    # GROQ_API_KEY (cost is ~$0 but tracked)
# ---------------------------------------------------------------------------
# Helper: pretty-print an AgentResponse
# ---------------------------------------------------------------------------
def print_response(label: str, response: AgentResponse) -> None:
    print(f"\n--- {label} ---")
    print(
        f"Content          : {response.content[:120]}"
        f"{'...' if len(response.content) > 120 else ''}"
    )
    print(f"Model            : {response.model}")
    print(f"Prompt tokens    : {response.prompt_tokens}")
    print(f"Completion tokens: {response.completion_tokens}")
    print(f"Total tokens     : {response.tokens_used}")
    if response.cost is not None:
        print(f"Cost             : ${response.cost:.6f}")
    else:
        print("Cost             : N/A (local model or no pricing data)")
    if response.tool_calls:
        print(f"Tool calls       : {[tc.name for tc in response.tool_calls]}")
# ---------------------------------------------------------------------------
# 1. Single synchronous call
# ---------------------------------------------------------------------------
def demo_single_run() -> AgentResponse:
    print("=" * 60)
    print("1. Single run_with_response() call")
    print("=" * 60)

    config = AgentConfig(
        model=MODEL,
        system_prompt="You are a concise assistant. Keep answers brief.",
        temperature=0.3,
    )
    agent = Agent(config)

    response = agent.run_with_response("What is the difference between RAM and ROM?")
    print_response("run_with_response", response)
    return response
# ---------------------------------------------------------------------------
# 2. Single async call
# ---------------------------------------------------------------------------
async def demo_async_run() -> AgentResponse:
    print("\n" + "=" * 60)
    print("2. Single arun_with_response() call (async)")
    print("=" * 60)

    config = AgentConfig(
        model=MODEL,
        system_prompt="You are a concise assistant. Keep answers brief.",
        temperature=0.3,
    )
    agent = Agent(config)

    response = await agent.arun_with_response(
        "Explain TCP vs UDP in one sentence each."
    )
    print_response("arun_with_response", response)
    return response
# ---------------------------------------------------------------------------
# 3. Accumulate cost across multiple calls in a session
# ---------------------------------------------------------------------------
def demo_cost_accumulation() -> None:
    print("\n" + "=" * 60)
    print("3. Accumulating cost across multiple runs")
    print("=" * 60)

    config = AgentConfig(
        model=MODEL,
        system_prompt="You are a helpful coding assistant. Be concise.",
        temperature=0.2,
    )
    agent = Agent(config)

    questions = [
        "What does the 'yield' keyword do in Python?",
        "When should I use a list comprehension vs a generator expression?",
        "Give me a one-line example of a generator expression.",
    ]

    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_tokens = 0
    total_cost = 0.0
    cost_available = False

    print(
        f"\nAsking {len(questions)} questions to the same agent (history accumulates):\n"
    )
    for i, question in enumerate(questions, start=1):
        response = agent.run_with_response(question)
        print_response(f"Q{i}: {question[:50]}...", response)

        # Accumulate; guard against None (local models)
        if response.prompt_tokens is not None:
            total_prompt_tokens += response.prompt_tokens
        if response.completion_tokens is not None:
            total_completion_tokens += response.completion_tokens
        if response.tokens_used is not None:
            total_tokens += response.tokens_used
        if response.cost is not None:
            total_cost += response.cost
            cost_available = True

    print("\n" + "-" * 40)
    print("SESSION TOTALS")
    print("-" * 40)
    print(f"Prompt tokens     : {total_prompt_tokens}")
    print(f"Completion tokens : {total_completion_tokens}")
    print(f"Total tokens      : {total_tokens}")
    if cost_available:
        print(f"Total cost        : ${total_cost:.6f}")
        print(f"Average per call  : ${total_cost / len(questions):.6f}")
    else:
        print("Total cost        : N/A (local model)")
# ---------------------------------------------------------------------------
# 4. Demonstrate accessing the raw AgentResponse fields directly
# ---------------------------------------------------------------------------
def demo_field_access() -> None:
    print("\n" + "=" * 60)
    print("4. Direct field access on AgentResponse")
    print("=" * 60)

    config = AgentConfig(model=MODEL, temperature=0.1)
    agent = Agent(config)
    response: AgentResponse = agent.run_with_response("Name three sorting algorithms.")

    # AgentResponse is a Pydantic model: all fields are typed attributes
    assert isinstance(response.content, str)
    assert response.model == MODEL

    # Token fields are Optional[int]: check before using
    if response.tokens_used is not None:
        print(f"Used {response.tokens_used} tokens total.")

    # Cost is Optional[float]: may be None for local/unknown models
    if response.cost is not None:
        budget = 0.01  # $0.01 example budget
        remaining = budget - response.cost
        print(
            f"Cost ${response.cost:.6f} of ${budget:.2f} budget. "
            f"Remaining: ${remaining:.4f}"
        )
    else:
        print("Cost not available for this model.")

    print(f"\nFull response content:\n{response.content}")
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_single_run()
    asyncio.run(demo_async_run())
    demo_cost_accumulation()
    demo_field_access()