Last Updated: December 24, 2025
Basic API Setup
pip install anthropic
Install Python SDK
npm install @anthropic-ai/sdk
Install Node.js SDK
export ANTHROPIC_API_KEY='your_key_here'
Set API key as environment variable
import anthropic

# The client reads the ANTHROPIC_API_KEY environment variable by default, so
# the key exported during setup is picked up without hard-coding a secret.
client = anthropic.Anthropic()

# Send a single user turn and wait for the complete reply.
message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": "Hello, Claude"},
    ],
)
# message.content is a list of content blocks rather than a plain string.
print(message.content)
Available Models
| Model | ID | Best For |
|---|---|---|
| Claude Opus 4.5 | claude-opus-4-5-20251101 | Complex tasks, maximum intelligence |
| Claude Sonnet 4.5 | claude-sonnet-4-5-20250929 | Balanced performance, most tasks |
| Claude Haiku 3.5 | claude-3-5-haiku-20241022 | Fast responses, simple tasks |
Function Calling (Tool Use)
# Tool definitions: each entry gives a name, a description, and a JSON Schema
# ("input_schema") describing the arguments the tool accepts.
tools = [
    {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "input_schema": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City name",
                },
                "unit": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"],
                    "description": "Temperature unit",
                },
            },
            "required": ["location"],
        },
    }
]

# Keep the conversation in one list so the follow-up request replays exactly
# the same user turn that produced the tool call.
messages = [
    {"role": "user", "content": "What's the weather in San Francisco?"}
]

message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    tools=tools,
    messages=messages,
)

# Handle tool use response
if message.stop_reason == "tool_use":
    # Collect one tool_result per tool_use block so that every call made in
    # this assistant turn is answered together in a single user message.
    tool_results = []
    for content in message.content:
        if content.type != "tool_use":
            continue
        # Execute your function here
        result = get_weather(**content.input)
        tool_results.append(
            {
                "type": "tool_result",
                "tool_use_id": content.id,
                "content": str(result),
            }
        )

    # Send result back to Claude
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        tools=tools,
        messages=messages
        + [
            {"role": "assistant", "content": message.content},
            {"role": "user", "content": tool_results},
        ],
    )
Streaming Responses
# Open a streaming request and echo each text fragment as soon as it arrives.
story_stream = client.messages.stream(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[{"role": "user", "content": "Write a story"}],
)
with story_stream as stream:
    for chunk in stream.text_stream:
        # flush=True pushes the partial output to the terminal immediately.
        print(chunk, end="", flush=True)
# With event handling
with client.messages.stream(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello"}],
) as stream:
    for event in stream:
        if event.type == "content_block_start":
            print("Content started")
        elif event.type == "content_block_delta":
            # Only text deltas carry a .text attribute; other delta kinds
            # (for example tool-input JSON deltas) must not be read as text.
            if event.delta.type == "text_delta":
                print(event.delta.text, end="", flush=True)
        elif event.type == "message_stop":
            print("\nDone")
Prompt Caching
cache_control: {"type": "ephemeral"}
Cache content blocks for 5 minutes
# System prompt block, marked as a cache breakpoint.
cached_system = [
    {
        "type": "text",
        "text": "You are an AI assistant with expertise in Python.",
        "cache_control": {"type": "ephemeral"},
    }
]

# The large, repeated context is marked for caching; the short question that
# follows it is left unmarked so it can vary between requests.
cached_context = {
    "type": "text",
    "text": large_code_base,
    "cache_control": {"type": "ephemeral"},
}
question = {"type": "text", "text": "Explain this code"}

message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system=cached_system,
    messages=[{"role": "user", "content": [cached_context, question]}],
)
# Cache hits reduce cost by 90% and latency significantly
Vision Capabilities
import base64

# Read the raw image bytes, then base64-encode them into a UTF-8 string that
# can be embedded in the JSON request body.
with open("image.jpg", "rb") as image_file:
    raw_bytes = image_file.read()
image_data = base64.b64encode(raw_bytes).decode("utf-8")

# The image block comes first, followed by the text instruction about it.
image_block = {
    "type": "image",
    "source": {
        "type": "base64",
        "media_type": "image/jpeg",
        "data": image_data,
    },
}
prompt_block = {"type": "text", "text": "Describe this image"}

message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[{"role": "user", "content": [image_block, prompt_block]}],
)
# Supported formats: JPEG, PNG, GIF, WebP
# Max size: 5MB per image
System Prompts & Context
# The system prompt is given as a list of text blocks; only the last block
# carries a cache_control marker.
system_blocks = [
    {
        "type": "text",
        "text": "You are a helpful AI coding assistant.",
    },
    {
        "type": "text",
        "text": "Always provide code examples with explanations.",
        "cache_control": {"type": "ephemeral"},
    },
]

message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    system=system_blocks,
    messages=[{"role": "user", "content": "Help me with Python"}],
)
# System prompts strongly influence Claude's behavior
Message History & Multi-turn Conversations
# Prior turns are replayed on every request, alternating user and assistant
# roles and ending with the new user question.
conversation = [
    {"role": "user", "content": "What's 2+2?"},
    {"role": "assistant", "content": "2+2 equals 4."},
    {"role": "user", "content": "What about 3+3?"},
]

message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=conversation,
)

# Add response to conversation
assistant_turn = {"role": "assistant", "content": message.content[0].text}
conversation.append(assistant_turn)
Temperature & Sampling Parameters
| Parameter | Range | Description |
|---|---|---|
| temperature | 0.0 - 1.0 | Randomness (0 = focused, 1 = creative) |
| top_p | 0.0 - 1.0 | Nucleus sampling threshold |
| top_k | 1 - 500 | Limit token selection to top K |
# Adjust temperature OR top_p, not both: the API reference recommends tuning
# only one of the two sampling controls per request. This example raises
# temperature for more varied output and leaves top_p at its default.
message = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    temperature=0.7,
    messages=[{"role": "user", "content": "Be creative"}],
)
Error Handling
from anthropic import (
APIError,
RateLimitError,
APIConnectionError,
AuthenticationError
)
# Build the request up front so the try body holds only the network call.
request = {
    "model": "claude-sonnet-4-20250514",
    "max_tokens": 1024,
    "messages": [{"role": "user", "content": "Hello"}],
}

# The most specific exception types are listed first; APIError comes last
# and handles whatever the earlier clauses did not match.
try:
    message = client.messages.create(**request)
except RateLimitError:
    print("Rate limit exceeded, wait before retrying")
except AuthenticationError:
    print("Invalid API key")
except APIConnectionError:
    print("Network connection failed")
except APIError as err:
    print(f"API error: {err}")
Best Practices
Use cache_control for repeated context
Reduce costs by 90% on cached content
Stream for long responses
Improve user experience with real-time output
Set max_tokens appropriately
Avoid unnecessary costs and timeouts
Use system prompts for consistency
Define behavior once, apply to all messages
Implement exponential backoff
Handle rate limits gracefully
Use temperature=0 for more deterministic output
Get more consistent results for the same inputs (identical output is not guaranteed)
Pricing Optimization
| Strategy | Savings |
|---|---|
| Prompt caching (cache hits) | 90% on cached tokens |
| Use Haiku for simple tasks | 10x cheaper than Opus |
| Reduce max_tokens | Pay only for what you need |
| Batch requests | Reduce API overhead |
💡 Pro Tip: Use prompt caching for large codebases, documentation, or any repeated context. Cache the system prompt and large context blocks, then vary only the user questions. This can reduce costs by 90% and improve response times significantly.