Cost Tracking
run_with_response() vs run()
Section titled “run_with_response() vs run()”

run() returns a plain string. run_with_response() returns an AgentResponse that includes the text reply, every tool call made, and token/cost metadata from the LiteLLM response.
from cyclops import Agent, AgentConfig

config = AgentConfig(model="gpt-4o-mini")
agent = Agent(config)

result = agent.run_with_response("Summarise the theory of relativity in two sentences.")

print(result.content)
print(f"Model : {result.model}")
print(f"Prompt : {result.prompt_tokens} tokens")
print(f"Completion : {result.completion_tokens} tokens")
print(f"Total : {result.tokens_used} tokens")
print(f"Cost : ${result.cost:.6f}")

The async version works identically:
import asyncio
async def main():
    result = await agent.arun_with_response("Explain neural networks briefly.")
    print(result.content)
    print(f"Cost: ${result.cost:.6f}")

asyncio.run(main())

AgentResponse fields
Section titled “AgentResponse fields”| Field | Type | Description |
|---|---|---|
| content | str | The final text response from the agent. |
| tool_calls | List[ToolCall] | Every tool call made during the run, in order. |
| model | str | The model identifier used (from AgentConfig.model). |
| tokens_used | int or None | Total tokens consumed (prompt + completion). None if unavailable. |
| prompt_tokens | int or None | Tokens in the prompt/context sent to the model. |
| completion_tokens | int or None | Tokens in the generated response. |
| cost | float or None | Estimated USD cost from LiteLLM’s pricing table. None for self-hosted models. |
Each ToolCall in tool_calls has:
| Field | Type | Description |
|---|---|---|
| id | str | Unique call identifier assigned by the LLM. |
| name | str | Name of the tool that was called. |
| arguments | dict | Arguments the model passed to the tool. |
| result | Any or None | The value returned by the tool. |
Accumulating cost across multiple calls
Section titled “Accumulating cost across multiple calls”

Track totals across many runs with a simple dataclass:
import asyncio
from dataclasses import dataclass, field
from typing import List

from cyclops import Agent, AgentConfig
from cyclops.core.types import AgentResponse


@dataclass
class UsageTracker:
    total_cost: float = 0.0
    total_tokens: int = 0
    runs: List[AgentResponse] = field(default_factory=list)

    def record(self, response: AgentResponse) -> None:
        self.runs.append(response)
        self.total_cost += response.cost or 0.0
        self.total_tokens += response.tokens_used or 0

    def report(self) -> None:
        print(f"Runs : {len(self.runs)}")
        print(f"Total cost : ${self.total_cost:.4f}")
        print(f"Total tokens : {self.total_tokens:,}")


tracker = UsageTracker()
config = AgentConfig(model="gpt-4o-mini")
agent = Agent(config)

questions = [
    "What is photosynthesis?",
    "Explain DNS in one sentence.",
    "What is the boiling point of water in Celsius?",
]

for q in questions:
    result = agent.run_with_response(q)
    tracker.record(result)
    print(f"Q: {q}")
    print(f"A: {result.content}")
    print(f" cost={result.cost:.6f} tokens={result.tokens_used}\n")

tracker.report()

Inspecting tool calls
Section titled “Inspecting tool calls”

When tools are used, each invocation is recorded in result.tool_calls:
from cyclops.toolkit import tool


@tool
def lookup(symbol: str) -> str:
    """Look up a stock price."""
    return "189.50"


config = AgentConfig(model="gpt-4o-mini")
agent = Agent(config, tools=[lookup])

result = agent.run_with_response("What is the current price of AAPL?")

print(result.content)
for tc in result.tool_calls:
    print(f" Called: {tc.name}({tc.arguments}) -> {tc.result}")

Full example
Section titled “Full example”

"""Cost and token tracking example: run_with_response() / arun_with_response()

Demonstrates:
1. Inspecting AgentResponse metadata (tokens_used, cost, prompt_tokens,
   completion_tokens) from a single synchronous run
2. Async variant with arun_with_response()
3. Accumulating cost and tokens across multiple calls to track a session budget

Cost data is powered by LiteLLM's built-in pricing table. The numbers are
accurate for cloud APIs (OpenAI, Anthropic, etc.) but will be None for
local/free models like Ollama since they have no billing cost.

Default model is gpt-4o-mini so the cost fields are populated with real values.
Swap MODEL below if you only have a local Ollama instance available.
"""
import asyncio
from cyclops import Agent, AgentConfig, AgentResponse
# ---------------------------------------------------------------------------# Model configuration# ---------------------------------------------------------------------------
# gpt-4o-mini gives real cost data (requires OPENAI_API_KEY)MODEL = "gpt-4o-mini"
# For a free/local alternative, swap to:# MODEL = "ollama/qwen3:4b"# Note: cost will be None for Ollama: token counts still work.
# Other cloud alternatives with cost data:# MODEL = "claude-3-haiku-20240307" # ANTHROPIC_API_KEY# MODEL = "groq/llama-3.1-8b-instant" # GROQ_API_KEY (cost is ~$0 but tracked)
# ---------------------------------------------------------------------------
# Helper: pretty-print an AgentResponse
# ---------------------------------------------------------------------------


def print_response(label: str, response: AgentResponse) -> None:
    """Print a labelled summary of a response: content preview, token counts, cost.

    Cost may be None (local models / no pricing data), in which case an
    "N/A" line is printed instead of a dollar amount.
    """
    print(f"\n--- {label} ---")
    # Preview only the first 120 characters of the reply.
    preview = response.content[:120]
    ellipsis = "..." if len(response.content) > 120 else ""
    print(f"Content : {preview}{ellipsis}")
    print(f"Model : {response.model}")
    print(f"Prompt tokens : {response.prompt_tokens}")
    print(f"Completion tokens: {response.completion_tokens}")
    print(f"Total tokens : {response.tokens_used}")
    if response.cost is None:
        print("Cost : N/A (local model or no pricing data)")
    else:
        print(f"Cost : ${response.cost:.6f}")
    if response.tool_calls:
        print(f"Tool calls : {[tc.name for tc in response.tool_calls]}")
# ---------------------------------------------------------------------------
# 1. Single synchronous call
# ---------------------------------------------------------------------------


def demo_single_run() -> AgentResponse:
    """Run one synchronous query and print its cost/token metadata."""
    banner = "=" * 60
    print(banner)
    print("1. Single run_with_response() call")
    print(banner)

    agent = Agent(
        AgentConfig(
            model=MODEL,
            system_prompt="You are a concise assistant. Keep answers brief.",
            temperature=0.3,
        )
    )

    response = agent.run_with_response("What is the difference between RAM and ROM?")
    print_response("run_with_response", response)
    return response
# ---------------------------------------------------------------------------
# 2. Single async call
# ---------------------------------------------------------------------------


async def demo_async_run() -> AgentResponse:
    """Run one query via the async API and print its cost/token metadata."""
    banner = "=" * 60
    print("\n" + banner)
    print("2. Single arun_with_response() call (async)")
    print(banner)

    agent = Agent(
        AgentConfig(
            model=MODEL,
            system_prompt="You are a concise assistant. Keep answers brief.",
            temperature=0.3,
        )
    )

    response = await agent.arun_with_response(
        "Explain TCP vs UDP in one sentence each."
    )
    print_response("arun_with_response", response)
    return response
# ---------------------------------------------------------------------------
# 3. Accumulate cost across multiple calls in a session
# ---------------------------------------------------------------------------


def demo_cost_accumulation() -> None:
    """Ask several questions on one agent and total tokens/cost across runs.

    Token and cost fields are Optional, so None values (e.g. local models)
    are skipped rather than summed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("3. Accumulating cost across multiple runs")
    print(banner)

    agent = Agent(
        AgentConfig(
            model=MODEL,
            system_prompt="You are a helpful coding assistant. Be concise.",
            temperature=0.2,
        )
    )

    questions = [
        "What does the 'yield' keyword do in Python?",
        "When should I use a list comprehension vs a generator expression?",
        "Give me a one-line example of a generator expression.",
    ]

    # Session-wide accumulators; cost_available flags whether any run
    # actually reported a cost.
    totals = {"prompt": 0, "completion": 0, "tokens": 0}
    total_cost = 0.0
    cost_available = False

    print(
        f"\nAsking {len(questions)} questions to the same agent (history accumulates):\n"
    )

    for i, question in enumerate(questions, start=1):
        response = agent.run_with_response(question)
        print_response(f"Q{i}: {question[:50]}...", response)

        # Guard every Optional field against None (local models).
        if response.prompt_tokens is not None:
            totals["prompt"] += response.prompt_tokens
        if response.completion_tokens is not None:
            totals["completion"] += response.completion_tokens
        if response.tokens_used is not None:
            totals["tokens"] += response.tokens_used
        if response.cost is not None:
            total_cost += response.cost
            cost_available = True

    rule = "-" * 40
    print("\n" + rule)
    print("SESSION TOTALS")
    print(rule)
    print(f"Prompt tokens : {totals['prompt']}")
    print(f"Completion tokens : {totals['completion']}")
    print(f"Total tokens : {totals['tokens']}")
    if cost_available:
        print(f"Total cost : ${total_cost:.6f}")
        print(f"Average per call : ${total_cost / len(questions):.6f}")
    else:
        print("Total cost : N/A (local model)")
# ---------------------------------------------------------------------------
# 4. Demonstrate accessing the raw AgentResponse fields directly
# ---------------------------------------------------------------------------


def demo_field_access() -> None:
    """Show direct, typed attribute access on an AgentResponse."""
    banner = "=" * 60
    print("\n" + banner)
    print("4. Direct field access on AgentResponse")
    print(banner)

    agent = Agent(AgentConfig(model=MODEL, temperature=0.1))

    response: AgentResponse = agent.run_with_response("Name three sorting algorithms.")

    # AgentResponse is a Pydantic model: all fields are typed attributes
    assert isinstance(response.content, str)
    assert response.model == MODEL

    # Token fields are Optional[int]: check before using
    if response.tokens_used is not None:
        print(f"Used {response.tokens_used} tokens total.")

    # Cost is Optional[float]: may be None for local/unknown models
    if response.cost is None:
        print("Cost not available for this model.")
    else:
        budget = 0.01  # $0.01 example budget
        remaining = budget - response.cost
        print(
            f"Cost ${response.cost:.6f} of ${budget:.2f} budget. Remaining: ${remaining:.4f}"
        )

    print(f"\nFull response content:\n{response.content}")
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Run all four demos in order; the async demo needs its own event loop.
    demo_single_run()
    asyncio.run(demo_async_run())
    demo_cost_accumulation()
    demo_field_access()