What is RAG?
Retrieval Augmented Generation (RAG) is a technique that enhances LLM responses by providing relevant external context. Instead of relying solely on the model's training data, RAG retrieves real-time information to generate more accurate, up-to-date responses.
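The core loop is small enough to sketch in a few lines. Nothing below is tied to any particular API; retrieve and generate are toy stand-ins for a search step and an LLM call:

def retrieve(question: str) -> list[str]:
    # Toy stand-in: a real system queries a search API or vector store here.
    return [f"Example document relevant to: {question}"]

def generate(prompt: str) -> str:
    # Toy stand-in: a real system calls an LLM here.
    return f"Answer grounded in the provided context ({len(prompt)} chars of prompt)."

def rag(question: str) -> str:
    context = "\n".join(retrieve(question))   # 1. retrieve external context
    prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}"
    return generate(prompt)                   # 2. generate a grounded answer

print(rag("What is RAG?"))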
Why Use Keiro for RAG?
Traditional RAG pipelines require:
1. Crawling web pages
2. Extracting content
3. Chunking text
4. Embedding and storing vectors
5. Similarity search
6. Context injection
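To make steps 3-5 concrete, here is a heavily simplified sketch; embed is a stand-in for a real embedding model, not an actual library call:

import numpy as np

def embed(text: str) -> np.ndarray:
    # Stand-in for a real embedding model (e.g., an embeddings API call).
    # Deterministic only within one process, which is fine for a sketch.
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.random(128)

def chunk(text: str, size: int = 500) -> list[str]:
    # Naive fixed-size chunking; real pipelines split on sentences or sections.
    return [text[i:i + size] for i in range(0, len(text), size)]

def top_k(query: str, chunks: list[str], k: int = 3) -> list[str]:
    # Rank chunks by cosine similarity between query and chunk embeddings.
    q = embed(query)
    def score(c: str) -> float:
        v = embed(c)
        return float(np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v)))
    return sorted(chunks, key=score, reverse=True)[:k]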
Keiro simplifies this by handling steps 1-4 automatically:
const API_URL = "https://kierolabs.space/api/search-pro";
const payload = {
query: "future of ai agents",
apiKey: "YOUR_API_KEY"
};
const response = await fetch(API_URL, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload)
});
const data = await response.json();
// Process results
console.log(`Credits remaining: ${data.creditsRemaining}`);
data.data?.extracted_content?.slice(0, 3).forEach(item => {
console.log(`- ${item.title}`);
console.log(` ${item.url}`);
});
Basic RAG Implementation
Step 1: Set Up Your Environment
pip install openai requests
Step 2: Create the Retrieval Function
import requests
import openai
OPENAI_API_KEY = "your-openai-key"
KEIRO_API_KEY = "your-keiro-key"
openai.api_key = OPENAI_API_KEY
API_URL = "https://kierolabs.space/api/search-pro"
def retrieve_context(query: str, num_results: int = 5) -> str:
"""Retrieve relevant context using Keiro Research API."""
payload = {
"query": query,
"apiKey": KEIRO_API_KEY,
"cache_search": True
}
response = requests.post(API_URL, json=payload)
data = response.json()
# Extract content
extracted = data.get("data", {}).get("extracted_content", [])[:num_results]
context_parts = []
for item in extracted:
context_parts.append(f"""
Source: {item.get("title")}
URL: {item.get("url")}
Content: {item.get("content", "")[:1000]}
---
""")
return "\n".join(context_parts)
Step 3: Build the RAG Function
def rag_query(user_question: str) -> str:
"""Answer questions using RAG."""
# Step 1: Retrieve context
context = retrieve_context(user_question)
# Step 2: Generate response with context
response = openai.chat.completions.create(
model="gpt-4o-mini", # better + cheaper than gpt-4
messages=[
{
"role": "system",
"content": (
"You are a helpful assistant. Answer the user's question "
"based ONLY on the provided context. "
"If the context does not contain the answer, say "
"'I could not find relevant information in the context.'\n\n"
f"Context:\n{context}"
)
},
{
"role": "user",
"content": user_question
}
],
temperature=0.3
)
return response.choices[0].message.content
Step 4: Use It
answer = rag_query("What are the latest breakthroughs in fusion energy?")
print(answer)
Advanced RAG Patterns
Multi-Source Research RAG
For complex questions, use Keiro's research endpoint:
import requests
import openai
API_URL = "https://kierolabs.space/api/research-pro"
KEIRO_API_KEY = "your-keiro-key"
def research_rag(query: str) -> str:
"""Deep research with citations using Keiro Research API."""
payload = {
"query": query,
"apiKey": KEIRO_API_KEY,
"cache_search": True
}
response = requests.post(API_URL, json=payload)
data = response.json()
extracted = data.get("data", {}).get("extracted_content", [])[:10]
# Build context with citations
context_parts = []
sources = []
for i, item in enumerate(extracted):
context_parts.append(f"[{i+1}] {item.get('content', '')[:800]}")
sources.append(f"[{i+1}] {item.get('title')} - {item.get('url')}")
context = f"""
Context:
{chr(10).join(context_parts)}
Sources:
{chr(10).join(sources)}
"""
response = openai.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
"Answer ONLY using the provided context. "
"Include citation numbers like [1], [2] when referencing sources. "
"If unsure, say you don't know.\n\n"
f"{context}"
)
},
{
"role": "user",
"content": query
}
],
temperature=0.3
)
    return response.choices[0].message.content
Hybrid Search RAG
Combine semantic search with keyword matching:
import requests
import openai
API_URL = "https://kierolabs.space/api/search-pro"
KEIRO_API_KEY = "your-keiro-key"
def extract_key_terms(query: str):
"""Simple keyword extraction (you can improve this)."""
    return query.split()[:5]  # basic version; a filtered variant follows below
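# Optional refinement (an assumption, not part of the original pipeline):
# drop common stopwords so the keyword query carries more signal.
STOPWORDS = {"the", "a", "an", "of", "in", "on", "for", "to", "and", "is", "are", "what", "how"}

def extract_key_terms_filtered(query: str):
    terms = [w for w in query.lower().split() if w not in STOPWORDS]
    return terms[:5]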
def search_api(query: str):
payload = {
"query": query,
"apiKey": KEIRO_API_KEY,
}
response = requests.post(API_URL, json=payload)
    return response.json().get("data", {}).get("extracted_content", [])
def merge_results(semantic, keyword):
seen_urls = set()
merged = []
for result in semantic + keyword:
url = result.get("url")
if url and url not in seen_urls:
seen_urls.add(url)
merged.append(result)
return merged[:8] # limit context
def hybrid_rag(query: str) -> str:
"""Hybrid RAG: semantic + keyword search."""
# Semantic search
semantic_results = search_api(query)
# Keyword search
key_terms = extract_key_terms(query)
keyword_query = " ".join(key_terms)
keyword_results = search_api(keyword_query)
# Merge
all_results = merge_results(semantic_results, keyword_results)
# Build context
context_parts = []
for i, item in enumerate(all_results):
context_parts.append(f"""
[{i+1}] {item.get("title")}
{item.get("content", "")[:500]}
Source: {item.get("url")}
""")
context = "\n".join(context_parts)
# LLM step
response = openai.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
"Answer ONLY using the provided context. "
"Use citations like [1], [2]. If unsure, say you don't know.\n\n"
f"{context}"
)
},
{"role": "user", "content": query}
],
temperature=0.3
)
    return response.choices[0].message.content
Streaming RAG
For real-time user experience:
from openai import AsyncOpenAI
client = AsyncOpenAI(api_key="your-openai-key")
async def streaming_rag(query: str):
"""Stream responses as they're generated."""
# Step 1: Retrieve context
context = retrieve_context(query)
# Step 2: Create stream
stream = await client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": (
"Answer ONLY using the provided context. "
"If the answer is not in the context, say you don't know.\n\n"
f"Context:\n{context}"
)
},
{"role": "user", "content": query}
],
stream=True,
temperature=0.3
)
# Step 3: Stream chunks safely
async for chunk in stream:
delta = chunk.choices[0].delta
if hasattr(delta, "content") and delta.content:
yield delta.content
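Because streaming_rag is an async generator, it needs an event loop to drive it; here is a minimal runner (assuming the synchronous retrieve_context from Step 2 is in scope):

import asyncio

async def main():
    async for token in streaming_rag("What are the latest breakthroughs in fusion energy?"):
        print(token, end="", flush=True)
    print()

asyncio.run(main())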
Production Best Practices
1. Caching
Leverage Keiro's 50% cache discount:
import requests
from functools import lru_cache
API_URL = "https://kierolabs.space/api/search-pro"
KEIRO_API_KEY = "your-keiro-key"
@lru_cache(maxsize=1000)
def cached_retrieve(query: str):
    payload = {
        "query": query,
        "apiKey": KEIRO_API_KEY,
        "cache_search": True,  # opt in to Keiro's server-side cache for the discount
    }
response = requests.post(API_URL, json=payload)
return response.json()
def retrieve_with_cache(query: str):
return cached_retrieve(query)
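One caveat: lru_cache never expires entries, which works against real-time data in a long-running process. A minimal time-based alternative, reusing API_URL, KEIRO_API_KEY, and requests from the snippet above (the 15-minute TTL is an arbitrary choice):

import time

_cache: dict[str, tuple[float, dict]] = {}
TTL_SECONDS = 900  # arbitrary: refresh anything older than 15 minutes

def retrieve_with_ttl(query: str) -> dict:
    now = time.time()
    hit = _cache.get(query)
    if hit and now - hit[0] < TTL_SECONDS:
        return hit[1]  # still fresh: skip the network call
    payload = {"query": query, "apiKey": KEIRO_API_KEY, "cache_search": True}
    data = requests.post(API_URL, json=payload, timeout=10).json()
    _cache[query] = (now, data)
    return data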
2. Error Handling
import requests
import time
API_URL = "https://kierolabs.space/api/search-pro"
KEIRO_API_KEY = "your-keiro-key"
def robust_retrieve(query: str, max_retries: int = 3):
payload = {
"query": query,
"apiKey": KEIRO_API_KEY,
}
for attempt in range(max_retries):
try:
response = requests.post(API_URL, json=payload, timeout=10)
# Handle HTTP errors
if response.status_code == 200:
return response.json()
elif response.status_code == 429:
# Rate limit
time.sleep(2 ** attempt)
elif response.status_code >= 500:
# Server error → retry
time.sleep(2 ** attempt)
else:
# Client error (400, 401, etc.) → don't retry
raise Exception(f"Request failed: {response.text}")
except requests.exceptions.RequestException as e:
# Network issues
if attempt == max_retries - 1:
raise
time.sleep(2 ** attempt)
raise Exception("Max retries exceeded")
3. Batch Processing
For high-volume applications:
import requests
from concurrent.futures import ThreadPoolExecutor
API_URL = "https://kierolabs.space/api/batch-search"
API_KEY = "your_api_key_here"
def fetch_query(query: str):
payload = {
"query": query,
"apiKey": API_KEY
}
response = requests.post(API_URL, json=payload)
return response.json()
def batch_search(queries):
results = []
with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(fetch_query, q) for q in queries]
for future in futures:
results.append(future.result())
return results
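Example usage (the queries are illustrative):

results = batch_search([
    "future of ai agents",
    "latest fusion energy breakthroughs",
    "state of quantum computing",
])
print(f"Fetched {len(results)} responses")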
4. Context Window Management
def optimize_context(results, max_tokens: int = 4000):
"""Fit context within token limits."""
context = ""
token_count = 0
for result in results:
        result_text = f"{result.get('title')}\n{result.get('content', '')}\n---\n"
result_tokens = len(result_text.split()) * 1.3 # Rough estimate
if token_count + result_tokens > max_tokens:
break
context += result_text
token_count += result_tokens
return context
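The 1.3-tokens-per-word multiplier is only a heuristic. For exact counts you can swap in the tiktoken library (an extra dependency, not used elsewhere in this guide); in recent tiktoken versions, o200k_base is the encoding for the gpt-4o model family:

import tiktoken

enc = tiktoken.get_encoding("o200k_base")  # gpt-4o family encoding

def count_tokens(text: str) -> int:
    return len(enc.encode(text))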
Complete Example: Q&A Chatbot
import requests
import openai
class RAGChatbot:
def __init__(self, keiro_key: str, openai_key: str):
self.keiro_api_key = keiro_key
openai.api_key = openai_key
self.conversation_history = []
self.api_url = "https://kierolabs.space/api/search-pro"
def chat(self, user_message: str) -> str:
# Retrieve relevant context
context = self._retrieve(user_message)
# Build messages
messages = [
{
"role": "system",
"content": (
"You are a helpful assistant with access to real-time web information. "
"Use ONLY the provided context to answer accurately. "
"If the answer is not in the context, say you don't know.\n\n"
f"Context:\n{context}"
)
}
] + self.conversation_history + [
{"role": "user", "content": user_message}
]
# Generate response
response = openai.chat.completions.create(
model="gpt-4o-mini",
messages=messages,
temperature=0.3
)
assistant_message = response.choices[0].message.content
# Update history
self.conversation_history.append({"role": "user", "content": user_message})
self.conversation_history.append({"role": "assistant", "content": assistant_message})
return assistant_message
def _retrieve(self, query: str) -> str:
payload = {
"query": query,
"apiKey": self.keiro_api_key,
}
response = requests.post(self.api_url, json=payload, timeout=10)
data = response.json()
        results = data.get("data", {}).get("extracted_content", [])
return "\n---\n".join([
f"Source: {item.get('title')}\nContent: {item.get('content', '')[:500]}"
for item in results[:5]
])
# Usage
bot = RAGChatbot(keiro_key="your-keiro-key", openai_key="your-openai-key")
response = bot.chat("What happened in tech news today?")
print(response)
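One production note: conversation_history grows without bound, so long sessions will eventually overflow the model's context window. A simple guard, applied after the history update in chat() (the 10-message cap is an arbitrary choice):

MAX_HISTORY_MESSAGES = 10  # arbitrary cap; tune to your context budget

def trim_history(history: list) -> list:
    # Keep only the most recent turns so the prompt stays bounded.
    return history[-MAX_HISTORY_MESSAGES:]

# inside chat(), after appending:
#   self.conversation_history = trim_history(self.conversation_history)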
Conclusion
Keiro makes building RAG pipelines dramatically simpler and cheaper:
No infrastructure needed - Skip vector databases and crawlers
Real-time data - Always fresh results
Cost-effective - 10x cheaper than alternatives
Production-ready - Built-in caching and batch processing
Get started with Keiro and build your first RAG pipeline in minutes.