Spaces:

sumitrwk
/

omnirouter-api

Sleeping

App Files Files Community

sumitrwk commited on Apr 8

Commit

b534a53

verified ·

1 Parent(s): 4f530ea

Upload 33 files

Browse files

Files changed (33) hide show

Dockerfile +37 -0
requirements.txt +26 -0
seed_db.py +24 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-312.pyc +0 -0
src/__pycache__/router.cpython-312.pyc +0 -0
src/__pycache__/schemas.cpython-312.pyc +0 -0
src/agent/__pycache__/graph.cpython-312.pyc +0 -0
src/agent/__pycache__/tools.cpython-312.pyc +0 -0
src/agent/graph.py +125 -0
src/agent/tools.py +59 -0
src/api/__pycache__/cache.cpython-312.pyc +0 -0
src/api/__pycache__/server.cpython-312.pyc +0 -0
src/api/cache.py +41 -0
src/api/server.py +77 -0
src/evaluation/judge.py +77 -0
src/evaluation/run_evals.py +53 -0
src/providers/__init__.py +0 -0
src/providers/__pycache__/__init__.cpython-312.pyc +0 -0
src/providers/__pycache__/anthropic_client.cpython-312.pyc +0 -0
src/providers/__pycache__/base.cpython-312.pyc +0 -0
src/providers/__pycache__/openai_client.cpython-312.pyc +0 -0
src/providers/anthropic_client.py +49 -0
src/providers/base.py +43 -0
src/providers/openai_client.py +75 -0
src/rag/__pycache__/chatbot.cpython-312.pyc +0 -0
src/rag/__pycache__/ingestion.cpython-312.pyc +0 -0
src/rag/__pycache__/vector_store.cpython-312.pyc +0 -0
src/rag/chatbot.py +63 -0
src/rag/ingestion.py +41 -0
src/rag/vector_store.py +61 -0
src/router.py +87 -0
src/schemas.py +61 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,37 @@

+# 1. The Foundation
+# We start with a lightweight, official Linux image with Python 3.12 pre-installed.
+FROM python:3.12-slim
+# 2. Environment variables
+# Prevent Python from writing .pyc files
+ENV PYTHONDONTWRITEBYTECODE=1
+# Ensure our terminal print() statements show up immediately in cloud logs
+ENV PYTHONUNBUFFERED=1
+# Tell Hugging Face exactly where to save its 100MB math model
+ENV HF_HOME=/app/.cache/huggingface
+# 3. The Workspace
+# Create a folder inside the container called /app and move inside it
+WORKDIR /app
+# 4. Cache optimization (the Architect's Trick)
+# We ONLY copy the requirements file first.
+# Docker caches steps. If you change your Python code later, Docker won't
+# force you to sit through a 5-minute re-installation of Pandas and LangChain!
+COPY requirements.txt .
+# Install the Python packages
+RUN pip install --no-cache-dir -r requirements.txt
+# 5. COPY the payload
+# Now we copy the rest of your actual code into the container
+COPY . .
+# 6. OPEN the gate
+# Document the port the server listens on.
+# This MUST match the --port passed to uvicorn in CMD below.
+EXPOSE 7860
+# 7. The ignition switch
+# The exact terminal command the container runs when it wakes up in the cloud.
+# Notice we use 0.0.0.0 so the cloud provider's router can find it.
+CMD ["uvicorn", "src.api.server:app", "--host", "0.0.0.0", "--port", "7860"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+pydantic
+openai
+anthropic
+langchain-openai
+langchain
+chromadb
+tiktoken
+langchain-text-splitters
+langchain-core
+langchain-community
+langgraph
+setuptools
+# Free tier...
+langchain-groq
+langchain-huggingface
+sentence-transformers
+python-dotenv
+fastapi
+uvicorn
+requests

seed_db.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from src.rag.vector_store import build_vector_store
+from langchain_core.documents import Document
+import os
+api_key = os.getenv("HF_TOKEN")
+def seed_database():
+    """
+    Build the vector store from one hard-coded sample document.
+    Reads the module-level `api_key` (taken from the HF_TOKEN environment
+    variable at import time) and forwards it to build_vector_store.
+    """
+    print("Seeding new HuggingFace database...")
+    # Placeholder text standing in for a real manual
+    manual_excerpt = (
+        "OmniRouter is an enterprise-grade AI architecture combining high-concurrency "
+        "LLM routing and local Vector Database retrieval. If the primary API fails, "
+        "it seamlessly switches to a fallback model. It uses LangGraph for agentic reasoning."
+    )
+    # Wrap the text as a single document and hand it straight to the builder
+    seed_docs = [Document(page_content=manual_excerpt, metadata={"source": "manual.pdf"})]
+    build_vector_store(seed_docs, api_key=api_key)
+if __name__ == "__main__":
+    seed_database()

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (126 Bytes). View file

src/__pycache__/router.cpython-312.pyc ADDED Viewed

Binary file (4.15 kB). View file

src/__pycache__/schemas.cpython-312.pyc ADDED Viewed

Binary file (2.75 kB). View file

src/agent/__pycache__/graph.cpython-312.pyc ADDED Viewed

Binary file (3.42 kB). View file

src/agent/__pycache__/tools.cpython-312.pyc ADDED Viewed

Binary file (2.56 kB). View file

src/agent/graph.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+from typing import Annotated, TypedDict
+# # OpenAI...
+# from langchain_openai import ChatOpenAI
+# Groq LLM...
+from langchain_groq import ChatGroq
+from langchain_core.messages import BaseMessage
+from langgraph.graph import StateGraph, START, END
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode, tools_condition
+from src.agent.tools import search_documentation
+# Import our custom tool
+from src.agent.tools import search_documentation
+# Fix the infinite loop using system prompt(Guardrail)
+from langchain_core.messages import SystemMessage
+# Human in the loop using LangGraphs checkpointers...
+from langgraph.checkpoint.memory import MemorySaver
+from dotenv import load_dotenv
+load_dotenv()
+# 1. UPGRADED STATE
+class AgentState(TypedDict):
+    # 'add_messages' ensures we append to the history, never overwrite it.
+    messages: Annotated[list[BaseMessage], add_messages]
+# 2. INITIALIZE THE BRAIN
+# We instantiate the LLM and "bind" our tool to it.
+# # Make sure you export GROQ_API_KEY in your terminal before running!
+# os.environ["GROQ_API_KEY"] = "gsk_jd"
+# We use Meta's Llama 3.1 8B model hosted on Groq for incredible speed
+llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)
+# # This sends the JSON schema we looked at yesterday directly to OpenAI.
+# llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
+tools = [search_documentation]
+llm_with_tools = llm.bind_tools(tools)
+# 3. THE NODES
+"""
+Chat node for seamless interaction with the LLM model...
+"""
+def chatbot_node(state: AgentState):
+    """
+    LLM node of the graph.
+    Puts a fixed system prompt in front of the conversation held in
+    state["messages"], invokes the tool-bound LLM, and returns its reply
+    as a one-element list under the "messages" key.
+    """
+    print("\n--- [NODE: Chatbot] Thinking... ---")
+    # Prompt-based guardrail: tell the model to stop after an empty search
+    # instead of calling the tool again.
+    guardrail_text = (
+        "You are an elite AI Engineering assistant. "
+        "You have access to a search_documentation tool. "
+        "CRITICAL RULE: If you use the tool and it returns 'No relevant information found', "
+        "you MUST NOT use the tool again. Immediately stop and tell the user "
+        "'I do not have enough information to answer that based on the documentation.' "
+        "Do not guess. Do not hallucinate."
+    )
+    # System rules first, then the existing history
+    conversation = [SystemMessage(content=guardrail_text), *state["messages"]]
+    reply = llm_with_tools.invoke(conversation)
+    # A one-element list lets the add_messages reducer append rather than overwrite
+    return {"messages": [reply]}
+# LangGraph has a built-in node specifically for executing tools!
+# It reads the "ToolCall" message, runs our Python function, and returns a "ToolMessage".
+tool_node = ToolNode(tools=tools)
+# =============================================
+# 4. COMPILE THE GRAPH with human in the loop...
+# =============================================
+workflow = StateGraph(AgentState)
+# Add our two worker nodes
+workflow.add_node("chatbot", chatbot_node)
+workflow.add_node("tools", tool_node)
+# Set the entry point
+workflow.add_edge(START, "chatbot")
+# 5. THE MAGIC ROUTING
+# 'tools_condition' is a built-in LangGraph edge.
+# It looks at the last message from the chatbot.
+# If it has a tool call, it routes to "tools". If it's just text, it routes to END.
+workflow.add_conditional_edges("chatbot", tools_condition)
+# After a tool finishes running, ALWAYS loop back to the chatbot
+# so it can read the database results and formulate a final answer!
+workflow.add_edge("tools", "chatbot")
+# Initialize the short-term memory vault
+memory = MemorySaver()
+# Compile with the memory and the breakpoint!
+app = workflow.compile(
+    checkpointer=memory,
+    interrupt_before=["tools"]  # tell the graph to pause before executing this node.
+)
+from langchain_core.messages import HumanMessage
+if __name__ == "__main__":
+    # Manual smoke test: stream one question through the compiled graph.
+    # GROQ_API_KEY must be available in the environment (load_dotenv() runs above).
+    print("========== AGENT TEST ==========")
+    initial_state = {
+        "messages": [HumanMessage(content="What does OmniRouter do?")]
+    }
+    # The graph is compiled with a checkpointer, so every run must name the
+    # thread its state is stored under; without it the run is rejected.
+    config = {"configurable": {"thread_id": "agent_smoke_test"}}
+    # stream() allows us to see the exact output of each node as it executes!
+    # Because of interrupt_before=["tools"], the stream stops before any tool runs.
+    for event in app.stream(initial_state, config):
+        for node_name, node_state in event.items():
+            # Skip updates that carry no message list (e.g. interrupt markers)
+            if not isinstance(node_state, dict) or not node_state.get("messages"):
+                continue
+            print(f"Update from node '{node_name}':")
+            # Print the content of the very last message added to the state
+            print(f" -> {node_state['messages'][-1].content}\n")

src/agent/tools.py ADDED Viewed

	@@ -0,0 +1,59 @@

+"""
+Let's officially give your agent its brain.
+We are going to use LangChain's @tool decorator.
+This magical little wrapper takes your standard Python function,
+reads the type hints (like query: str), reads the docstring, and automatically
+translates the entire thing into a strict JSON schema that OpenAI and Anthropic natively understand.
+"""
+import os
+from langchain_core.tools import tool
+from src.rag.vector_store import get_vector_store
+# The @tool decorator converts this Python function into an LLM-readable JSON schema
+@tool
+def search_documentation(query: str) -> str:
+    """
+    Searches the internal engineering documentation for information about the OmniRouter,
+    LangChain, LangGraph, or general AI engineering concepts.
+    Use this tool WHENEVER the user asks a technical question about how the system works,
+    fallback protocols, or specific coding architecture. Do NOT use this for general greetings.
+    Args:
+        query: The specific search term to look up in the database.
+               It should be a standalone, highly descriptive phrase.
+    """
+    # NOTE: the docstring above is what the LLM reads as the tool description;
+    # editing it changes how the agent decides to call this tool.
+    print(f"\n--- [TOOL EXECUTION] Searching Vector DB for: '{query}' ---")
+    # The token is read from the environment on every call
+    hf_token = os.getenv("HF_TOKEN")
+    try:
+        # k=2: keep only the two closest chunks
+        matches = get_vector_store(hf_token).similarity_search(query, k=2)
+        if matches:
+            # Flatten to one string so the LLM receives plain text
+            return "\n\n".join(match.page_content for match in matches)
+        return "No relevant information found in the documentation."
+    except Exception as exc:
+        # Log on the server, and also report the failure back to the LLM as text
+        print(f"\n🚨 [TOOL CRASHED]: {str(exc)}")
+        return f"Error executing search: {str(exc)}"
+if __name__ == "__main__":
+    # Print the name the LLM sees
+    print(f"Tool Name: {search_documentation.name}")
+    # Print the description the LLM reads to make its decision
+    print(f"\nTool Description: \n{search_documentation.description}")
+    # Print the strict JSON schema the LLM must follow to use the tool
+    print(f"\nTool Arguments Schema: \n{search_documentation.args}")

src/api/__pycache__/cache.cpython-312.pyc ADDED Viewed

Binary file (2.05 kB). View file

src/api/__pycache__/server.cpython-312.pyc ADDED Viewed

Binary file (2.72 kB). View file

src/api/cache.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_core.documents import Document
+CACHE_DIR = "./semantic_cache_db"
+# Built on first use and then reused, so the embedding model is loaded once
+# per process instead of once per cache lookup/save.
+_cache_db = None
+def get_cache_db():
+    """
+    Return the semantic-cache Chroma store, creating it on first use.
+    The store persists under CACHE_DIR and embeds text with the
+    "all-MiniLM-L6-v2" HuggingFace model.
+    """
+    global _cache_db
+    if _cache_db is None:
+        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+        _cache_db = Chroma(persist_directory=CACHE_DIR, embedding_function=embeddings)
+    return _cache_db
+def check_cache(query: str, threshold: float = 0.5) -> str | None:
+    """
+    Look up the closest previously cached question and return its stored answer.
+    Args:
+        query: The user's question; it is searched against the cached questions.
+        threshold: Upper bound on the score for a hit. The score is treated as
+            a distance, so a LOWER value means a closer match.
+    Returns:
+        The "answer" entry of the matched document's metadata on a hit,
+        otherwise None.
+    """
+    nearest = get_cache_db().similarity_search_with_score(query, k=1)
+    # At most one (document, score) pair comes back; none when nothing matched
+    for cached_doc, distance in nearest[:1]:
+        if distance < threshold:
+            print(f"\n🟢 [CACHE HIT] Similar question found! (Score: {distance:.3f})")
+            return cached_doc.metadata.get("answer")
+    print("\n🔴 [CACHE MISS] Question is new.")
+    return None
+def save_to_cache(query: str, answer: str):
+    """
+    Store a question/answer pair in the cache database.
+    The question becomes the document text (what later lookups are matched
+    against); the answer travels along in the metadata under "answer".
+    """
+    entry = Document(page_content=query, metadata={"answer": answer})
+    get_cache_db().add_documents([entry])
+    print("\n💾 [CACHE SAVED] New interaction stored for future users.")

src/api/server.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import json
+import asyncio
+from fastapi import FastAPI  #, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+from typing import List
+from langchain_core.messages import HumanMessage, AIMessage
+# Import our compiled LangGraph agent
+from src.agent.graph import app as agent_app
+from src.api.cache import check_cache, save_to_cache
+# 1. Initialize the FastAPI Server
+app = FastAPI(
+    title="OmniRouter Streaming API Agent",
+    description="Enterprise RAG Agent powered by LangGraph and FastAPI",
+    version="1.0.1"
+)
+# 2. Define our Request Schema using Pydantic
+class ChatRequest(BaseModel):
+    query: str
+async def stream_generator(query: str):
+    """
+    An async generator that yields the answer to `query` in a format
+    compatible with Server-Sent Events (SSE).
+    A cached answer is replayed word by word; otherwise tokens are streamed
+    from the LangGraph agent and the completed answer is saved to the cache.
+    Args:
+        query: The user's question.
+    Yields:
+        SSE frames of the form 'data: {"token": "..."}' followed by a blank line.
+    """
+    import uuid  # local import: only needed to mint a per-request thread id
+    # ==========================================
+    # 1. THE CACHE LAYER (Lightning Fast)
+    # ==========================================
+    # check_cache is synchronous; run it in a worker thread so the event loop
+    # keeps serving other requests in the meantime.
+    cached_answer = await asyncio.to_thread(check_cache, query)
+    if cached_answer:
+        # We chop the cached string into words and stream them instantly
+        for word in cached_answer.split(" "):
+            yield f"data: {json.dumps({'token': word + ' '})}\n\n"
+            # We add a tiny 20ms sleep just to preserve the "typewriter" feel for the user
+            await asyncio.sleep(0.02)
+        return # EXIT EARLY! The LLM is never triggered.
+    # ==========================================
+    # 2. THE AGENT LAYER (Heavy Compute)
+    # ==========================================
+    initial_state = {"messages": [HumanMessage(content=query)]}
+    # The agent graph is compiled with a checkpointer, which needs a thread id
+    # to store state under. A fresh id per request keeps requests isolated.
+    config = {"configurable": {"thread_id": str(uuid.uuid4())}}
+    answer_parts = [] # We need to collect the tokens to save them later
+    # NOTE(review): the graph is compiled with interrupt_before=["tools"], so a
+    # run that requests a tool pauses there and this stream ends without an
+    # answer. Confirm whether the API should resume the run or the breakpoint
+    # should be removed for this entry point.
+    # .astream_events is the key to deep-access streaming in LangChain/LangGraph
+    async for event in agent_app.astream_events(initial_state, config=config, version="v1"):
+        # 'on_chat_model_stream' fires every time the LLM produces a new chunk
+        if event["event"] != "on_chat_model_stream":
+            continue
+        content = event["data"]["chunk"].content
+        if content:
+            answer_parts.append(content)
+            # SSE format requires the "data: " prefix
+            yield f"data: {json.dumps({'token': content})}\n\n"
+    full_answer = "".join(answer_parts)
+    # ==========================================
+    # 3. SAVE FOR THE FUTURE
+    # ==========================================
+    # Only cache if we got an answer, AND the answer isn't our fallback failure phrase
+    failure_phrase = "I do not have enough information"
+    if full_answer and failure_phrase not in full_answer:
+        # Synchronous write; keep it off the event loop as well
+        await asyncio.to_thread(save_to_cache, query, full_answer)
+    else:
+        print("\n⚠️ [CACHE SKIP] Agent failed to answer. Did not poison the cache.")
+@app.post("/chat/stream")
+async def chat_streaming_endpoint(request: ChatRequest):
+    return StreamingResponse(
+        stream_generator(request.query),
+        media_type="text/event-stream"
+    )

src/evaluation/judge.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""
+We are going to create a strict grading script using LangChain's
+**with_structured_output**. This forces our Judge LLM to return a strict JSON
+object containing an integer score (1 for Pass, 0 for Fail) and a reasoning string.
+"""
+from pydantic import BaseModel, Field
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate
+from dotenv import load_dotenv
+load_dotenv()
+# ==========================================
+# 1. The Strict Grading Schema
+# ==========================================
+class HallucinationScore(BaseModel):
+    score: int = Field(description="Return 1 if perfectly grounded. Return 0 if hallucinated.")
+    reasoning: str = Field(description="A 1-sentence explanation of why you gave this score.")
+# ==========================================
+# 2. Initialize the Impartial Judge
+# ==========================================
+# We use temperature=0 because we want strict, deterministic grading, not creativity!
+model_name_1 = "llama-3.1-70b-versatile"
+model_name_2 = "llama-3.1-8b-instant"
+judge_llm = ChatGroq(model=model_name_2, temperature=0)
+structured_judge = judge_llm.with_structured_output(HallucinationScore)
+# ==========================================
+# 3. The Grading Rubric (System Prompt)
+# ==========================================
+system_prompt = """You are an impartial AI Compliance Judge evaluating an Agent's response.
+You will be given the 'Retrieved Context' from the database, and the 'Agent Answer'.
+Your ONLY job is to check for HALLUCINATIONS.
+RULE:
+- If the Agent's answer contains ANY factual information, names, or numbers that are NOT present in the Retrieved Context, score it a 0.
+- If the Agent's answer is strictly based ONLY on the context, score it a 1.
+- Do not grade grammar or tone. Only grade factual grounding.
+"""
+prompt = ChatPromptTemplate.from_messages([
+    ("system", system_prompt),
+    ("human", "Retrieved Context: \n\n {context} \n\n Agent Answer: \n\n {answer}")
+])
+evaluator = prompt | structured_judge
+def check_hallucination(context: str, answer: str):
+    """
+    Ask the judge chain whether `answer` is grounded in `context`.
+    Args:
+        context: The retrieved text the answer should be based on.
+        answer: The agent's answer to grade.
+    Returns:
+        Whatever the evaluator chain returns, or None if invoking it raised
+        (the error is printed, not re-raised).
+    """
+    print("\n⚖️ [JUDGE] Evaluating answer for hallucinations...")
+    payload = {"context": context, "answer": answer}
+    try:
+        return evaluator.invoke(payload)
+    except Exception as e:
+        print(f"Judge Error: {e}")
+        return None
+if __name__ == "__main__":
+    # The reality: What our Vector DB actually found.
+    simulated_context = (
+        "OmniRouter is an AI architecture that routes LLM requests. "
+        "It supports OpenAI and Anthropic APIs."
+    )
+    # (banner, agent answer) pairs: one grounded answer, one that adds a
+    # provider that is not in the context.
+    demo_cases = [
+        (
+            "\n========== TEST 1: The Good Agent ==========",
+            "OmniRouter routes requests and works with Anthropic and OpenAI.",
+        ),
+        (
+            "\n========== TEST 2: The Hallucinating Agent ==========",
+            "OmniRouter routes requests and works with OpenAI, Anthropic, and Google Gemini.",
+        ),
+    ]
+    for banner, agent_answer in demo_cases:
+        print(banner)
+        verdict = check_hallucination(simulated_context, agent_answer)
+        # check_hallucination returns None when the judge call fails
+        if verdict is None:
+            print("Judge returned no verdict; see the error above.")
+            continue
+        print(f"Score: {verdict.score}/1")
+        print(f"Reasoning: {verdict.reasoning}")

src/evaluation/run_evals.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from langchain_core.messages import HumanMessage, ToolMessage, AIMessage
+# Import your real compiled agent and your real judge
+from src.agent.graph import app
+from src.evaluation.judge import check_hallucination
+def evaluate_real_agent(query: str):
+    """
+    Run the compiled agent on `query` and grade its final answer with the
+    hallucination judge, printing a report.
+    Args:
+        query: The question to send to the agent.
+    Returns:
+        None. Returns early (after printing why) when the agent produced no
+        tool output or the judge produced no verdict.
+    """
+    import uuid  # local import: only needed to build a unique thread id
+    print(f"\n==================================================")
+    print(f"🚀 RUNNING REAL AGENT EVALUATION")
+    print(f"Query: '{query}'")
+    print(f"==================================================")
+    # 1. Trigger the Real Agent
+    initial_state = {"messages": [HumanMessage(content=query)]}
+    # Unique thread per run: the checkpointer keeps history per thread_id, so a
+    # reused id would mix earlier runs' tool output into this evaluation.
+    config = {"configurable": {"thread_id": f"automated_eval_run_{uuid.uuid4().hex}"}}
+    print("\n🤖 Agent is thinking and searching...")
+    # We use .invoke() here because we don't need streaming for a backend test
+    final_state = app.invoke(initial_state, config)
+    # The graph is compiled with interrupt_before=["tools"], so invoke() returns
+    # as soon as the LLM requests a tool. Nobody is here to approve it, so
+    # resume (input=None) until no node is pending. Bounded so a model that
+    # keeps calling tools cannot loop forever.
+    max_resumes = 5
+    for _ in range(max_resumes):
+        if not app.get_state(config).next:
+            break
+        final_state = app.invoke(None, config)
+    # 2. Extract the Dynamic Data from the State Machine's Memory
+    retrieved_context = ""
+    final_answer = ""
+    for msg in final_state["messages"]:
+        # Find the exact text the tool returned
+        if isinstance(msg, ToolMessage):
+            retrieved_context += msg.content + "\n"
+        # The last AI message with content wins as the final answer
+        elif isinstance(msg, AIMessage) and msg.content:
+            final_answer = msg.content
+    if not retrieved_context:
+        print("⚠️ Agent did not use the database. Cannot run Hallucination check.")
+        return
+    # 3. Pass the dynamic data to the Judge
+    result = check_hallucination(context=retrieved_context, answer=final_answer)
+    # check_hallucination returns None when the judge call fails
+    if result is None:
+        print("⚠️ Judge did not return a verdict. No score available.")
+        return
+    # 4. Print the final Evaluation Report
+    print(f"\n📊 EVALUATION REPORT")
+    print(f"Score: {result.score} / 1")
+    if result.score == 1:
+        print("✅ PASS: Answer is completely grounded in the database.")
+    else:
+        print("❌ FAIL: Hallucination detected!")
+    print(f"Judge's Reasoning: {result.reasoning}")
+    print(f"==================================================\n")
+if __name__ == "__main__":
+    # Test our agent with a real query!
+    evaluate_real_agent("What is OmniRouter and what does it do?")

src/providers/__init__.py ADDED Viewed

File without changes

src/providers/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (136 Bytes). View file

src/providers/__pycache__/anthropic_client.cpython-312.pyc ADDED Viewed

Binary file (2.92 kB). View file

src/providers/__pycache__/base.cpython-312.pyc ADDED Viewed

Binary file (1.79 kB). View file

src/providers/__pycache__/openai_client.cpython-312.pyc ADDED Viewed

Binary file (3.23 kB). View file

src/providers/anthropic_client.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from anthropic import AsyncAnthropic
+import logging
+from src.schemas import RouterConfig, LLMResponse
+from src.providers.base import BaseLLMProvider
+logger = logging.getLogger(__name__)
+class AnthropicProvider(BaseLLMProvider):
+    """Provider implementation that talks to Anthropic through the async client."""
+    # Price per 1M tokens, keyed by model name
+    _PRICING = {
+        "claude-3-opus-20240229": {"prompt": 15.0, "completion": 75.0},
+        "claude-3-5-sonnet-20240620": {"prompt": 3.0, "completion": 15.0},
+        "claude-3-haiku-20240307": {"prompt": 0.25, "completion": 1.25},
+    }
+    def __init__(self, api_key: str):
+        """Store the key on the base class and create the async Anthropic client."""
+        super().__init__(api_key)
+        self.client = AsyncAnthropic(api_key=self.api_key)
+    async def async_generate(self, prompt: str, config: RouterConfig) -> LLMResponse:
+        """
+        Send `prompt` as a single user message and wrap the reply in an LLMResponse.
+        Model and temperature come from `config`; max_tokens is fixed at 1024.
+        """
+        logger.info(f"Routing request to Anthropic using model: {config.model}")
+        # max_tokens is passed explicitly on every call
+        reply = await self.client.messages.create(
+            model=config.model,
+            max_tokens=1024,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=config.temperature,
+        )
+        usage = reply.usage
+        tokens_in = usage.input_tokens
+        tokens_out = usage.output_tokens
+        return LLMResponse(
+            # Only the first content block's text is used
+            content=reply.content[0].text,
+            provider_used="anthropic",
+            model_used=config.model,
+            prompt_tokens=tokens_in,
+            completion_tokens=tokens_out,
+            cost_estimate=self.calculate_cost(tokens_in, tokens_out, config.model),
+        )
+    def calculate_cost(self, prompt_tokens: int, completion_tokens: int, model_name: str) -> float:
+        """
+        Estimate the cost of a call from its token counts.
+        Models missing from the pricing table are priced at 0.0.
+        The result is rounded to 6 decimal places.
+        """
+        rates = self._PRICING.get(model_name, {"prompt": 0.0, "completion": 0.0})
+        prompt_cost = (prompt_tokens / 1_000_000) * rates["prompt"]
+        completion_cost = (completion_tokens / 1_000_000) * rates["completion"]
+        return round(prompt_cost + completion_cost, 6)

src/providers/base.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""
+The abstract blueprint every provider must follow
+"""
+from abc import ABC, abstractmethod
+from typing import Optional
+from src.schemas import RouterConfig, LLMResponse
+class BaseLLMProvider(ABC):
+    """
+    Abstract base class for every LLM provider.
+    Subclasses must implement both `async_generate` and `calculate_cost`;
+    a subclass missing either cannot be instantiated (TypeError).
+    """
+    def __init__(self, api_key: str):
+        """Keep the provider's API key (a dummy value is fine for local models)."""
+        self.api_key = api_key
+    @abstractmethod
+    async def async_generate(self, prompt: str, config: RouterConfig) -> LLMResponse:
+        """
+        Generate a completion.
+        Takes a string prompt and a RouterConfig and must return an LLMResponse.
+        """
+    @abstractmethod
+    def calculate_cost(self, prompt_tokens: int, completion_tokens: int, model_name: str) -> float:
+        """
+        Return the estimated cost of an API call given its token counts
+        and the model that served it.
+        """

src/providers/openai_client.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from openai import AsyncOpenAI
+import logging
+# Import our strict schemas and base blueprint
+from src.schemas import RouterConfig, LLMResponse
+from src.providers.base import BaseLLMProvider
+# Set up logging for professional debugging
+logger = logging.getLogger(__name__)
+class OpenAIProvider(BaseLLMProvider):
+    """
+    Provider implementation for OpenAI's API.
+    Fulfils the contract defined in BaseLLMProvider.
+    """
+    # Price per 1M tokens, keyed by model name
+    _PRICING = {
+        "gpt-4-turbo": {"prompt": 10.0, "completion": 30.0},
+        "gpt-4o": {"prompt": 5.0, "completion": 15.0},
+        "gpt-3.5-turbo": {"prompt": 0.5, "completion": 1.5},
+    }
+    def __init__(self, api_key: str):
+        """Store the key on the base class and create the ASYNC OpenAI client."""
+        super().__init__(api_key)
+        # The async client (not the synchronous one) is what lets callers
+        # await many requests concurrently.
+        self.client = AsyncOpenAI(api_key=self.api_key)
+    async def async_generate(self, prompt: str, config: RouterConfig) -> LLMResponse:
+        """
+        Send `prompt` as a single user message and wrap the reply in an LLMResponse.
+        Model and temperature come from `config`.
+        """
+        logger.info(f"Routing request to OpenAI using model: {config.model}")
+        completion = await self.client.chat.completions.create(
+            model=config.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=config.temperature,
+        )
+        # Pull what we need out of OpenAI's response structure
+        usage = completion.usage
+        tokens_in = usage.prompt_tokens
+        tokens_out = usage.completion_tokens
+        # Convert to the provider-neutral LLMResponse so callers never touch
+        # OpenAI-specific objects.
+        return LLMResponse(
+            content=completion.choices[0].message.content,
+            provider_used="openai",
+            model_used=config.model,
+            prompt_tokens=tokens_in,
+            completion_tokens=tokens_out,
+            cost_estimate=self.calculate_cost(tokens_in, tokens_out, config.model),
+        )
+    def calculate_cost(self, prompt_tokens: int, completion_tokens: int, model_name: str) -> float:
+        """
+        Estimate the cost of a call from its token counts (rates are per 1M tokens).
+        Models missing from the pricing table are priced at 0.0 rather than raising.
+        The result is rounded to 6 decimal places.
+        """
+        rates = self._PRICING.get(model_name, {"prompt": 0.0, "completion": 0.0})
+        # Math: (tokens / 1,000,000) * rate
+        prompt_cost = (prompt_tokens / 1_000_000) * rates["prompt"]
+        completion_cost = (completion_tokens / 1_000_000) * rates["completion"]
+        return round(prompt_cost + completion_cost, 6)

src/rag/__pycache__/chatbot.cpython-312.pyc ADDED Viewed

Binary file (2.14 kB). View file

src/rag/__pycache__/ingestion.cpython-312.pyc ADDED Viewed

Binary file (1.66 kB). View file

src/rag/__pycache__/vector_store.cpython-312.pyc ADDED Viewed

Binary file (1.89 kB). View file

src/rag/chatbot.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from langchain_openai import ChatOpenAI
+from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+# from langchain.chains.history_aware_retriever import create_history_aware_retriever
+# from langchain.chains.retrieval import create_retrieval_chain
+# from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from src.rag.vector_store import get_vector_store
+def build_doc_assistant(api_key: str):
+    """
+    Assemble the conversational RAG chain and return it.
+    The chain first rewrites the latest question into a standalone one using
+    the chat history, retrieves the top 2 chunks for it, and then answers
+    from those chunks only.
+    Args:
+        api_key: Key passed to ChatOpenAI and to get_vector_store.
+    Returns:
+        The chain produced by create_retrieval_chain.
+    """
+    # temperature=0 because we want factual answers, not creative ones
+    chat_model = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo", temperature=0)
+    # NOTE(review): the same key is handed to the vector store here, while the
+    # agent tool passes HF_TOKEN to get_vector_store. Confirm which key it expects.
+    vector_store = get_vector_store(api_key)
+    # k=2 means return the top 2 most relevant chunks
+    doc_retriever = vector_store.as_retriever(search_kwargs={"k": 2})
+    # ------------------------------------------
+    # Stage 1: rewrite the question so it stands on its own
+    # ------------------------------------------
+    rewrite_instructions = (
+        "Given a chat history and the latest user question "
+        "which might reference context in the chat history, "
+        "formulate a standalone question which can be understood "
+        "without the chat history. Do NOT answer the question, "
+        "just reformulate it if needed and otherwise return it as is."
+    )
+    rewrite_prompt = ChatPromptTemplate.from_messages([
+        ("system", rewrite_instructions),
+        MessagesPlaceholder("chat_history"),  # the conversation so far goes here
+        ("human", "{input}"),
+    ])
+    rewriting_retriever = create_history_aware_retriever(chat_model, doc_retriever, rewrite_prompt)
+    # ------------------------------------------
+    # Stage 2: answer from the retrieved chunks
+    # ------------------------------------------
+    answer_instructions = (
+        "You are an elite AI Engineering Assistant. "
+        "Use the following pieces of retrieved context to answer the question. "
+        "If the answer is not contained in the context, say 'I don't know based on the documentation.' "
+        "Do not make up an answer. Keep it concise.\n\n"
+        "Context: {context}"
+    )
+    answer_prompt = ChatPromptTemplate.from_messages([
+        ("system", answer_instructions),
+        MessagesPlaceholder("chat_history"),
+        ("human", "{input}"),
+    ])
+    # Fills {context} with the retrieved chunks
+    answering_chain = create_stuff_documents_chain(chat_model, answer_prompt)
+    # ------------------------------------------
+    # Stage 3: retrieval + answering as one chain
+    # ------------------------------------------
+    return create_retrieval_chain(rewriting_retriever, answering_chain)

src/rag/ingestion.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+def chunk_document_text(raw_text: str):
+    """
+    Simulates taking a massive document and chunking it for a Vector Store.
+    """
+    print(f"Original Document Length: {len(raw_text)} characters")
+    # THE CHUNKER CONFIGURATION
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=100,       # The maximum size of each chunk
+        chunk_overlap=20,     # How much the chunks should overlap
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""] # Tries to split at paragraphs first, then sentences
+    )
+    # Create a LangChain Document object
+    doc = Document(page_content=raw_text, metadata={"source": "engineering_manual.pdf"})
+    # Execute the split
+    chunks = text_splitter.split_documents([doc])
+    print(f"\nCreated {len(chunks)} chunks.")
+    # Let's inspect the exact output to understand the data structure
+    for i, chunk in enumerate(chunks):
+        print(f"\n--- Chunk {i+1} ---")
+        print(chunk.page_content)
+    return chunks
+# Let's test it with a sample "manual"
+if __name__ == "__main__":
+    sample_manual = (
+        "OmniRouter is an advanced asynchronous LLM routing engine. "
+        "It is designed to handle multiple providers gracefully. "
+        "If the primary provider fails, the system initiates a failover protocol. "
+        "This ensures maximum uptime for production systems."
+    )
+    chunk_document_text(sample_manual)

src/rag/vector_store.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import os
+from typing import List
+from langchain_core.documents import Document
+from langchain_community.vectorstores import Chroma
+# OpenAI embedding
+# from langchain_openai import OpenAIEmbeddings
+# Free local embedding
+from langchain_huggingface import HuggingFaceEmbeddings
+from dotenv import load_dotenv
+load_dotenv()  # Load variables from a local .env file into the process environment
+# NOTE: never hard-code a HuggingFace token here; provide HF_TOKEN via the environment / .env file instead.
+# os.environ["HF_TOKEN"] = "hf_PWDT"
+# Directory where the local Chroma database is saved (relative path, resolved against the working directory)
+DB_DIRECTORY = "./chroma_db"
+def get_embeddings_model():
+    """Returns the active embedding model."""
+    # --- FREE PIPELINE ---
+    # This downloads a small, highly efficient open-source model to your machine.
+    print("Loading HuggingFace Embeddings...")
+    # api_key = os.getenv("HUGGINGFACE_API_KEY")
+    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+    # --- PAID PIPELINE (Uncomment when you have credits) ---
+    # We use OpenAI's embedding model here. It converts text to 1536-dimensional vectors.
+    # api_key = os.getenv("OPENAI_API_KEY")
+    # return OpenAIEmbeddings(api_key=api_key, model="text-embedding-3-small")
+def build_vector_store(chunks: List[Document], api_key: str):
+    """
+    Takes a list of chunked documents, embeds them, and saves them to a local Chroma database.
+    """
+    embeddings = get_embeddings_model()
+    print(f"Embedding {len(chunks)} chunks and saving to {DB_DIRECTORY}...")
+    # 1. Create the database
+    # 2. Embed all the chunks
+    # 3. Save it to the DB_DIRECTORY
+    vector_store = Chroma.from_documents(
+        documents=chunks,
+        embedding=embeddings,
+        persist_directory=DB_DIRECTORY
+    )
+    # Force the database to save to disk
+    vector_store.persist()
+    print("Database successfully built and saved to disk!")
+    return vector_store
+def get_vector_store(api_key: str):
+    """
+    Retrieves the existing database from the hard drive so we don't have to rebuild it every time.
+    """
+    embeddings = get_embeddings_model()
+    return Chroma(persist_directory=DB_DIRECTORY, embedding_function=embeddings)

src/router.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import asyncio
+import logging
+from typing import Dict, Any
+# Import our schemas and providers
+from src.schemas import RouterConfig, LLMResponse
+from src.providers.base import BaseLLMProvider
+from src.providers.openai_client import OpenAIProvider
+from src.providers.anthropic_client import AnthropicProvider
+# Configure the root logger when this module is imported so retry/failover messages show up in the terminal
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+class OmniRouter:
+    """
+    The central routing engine.
+    Handles provider selection, retries, and error management.
+    """
+    def __init__(self, api_keys: Dict[str, str]):
+        """
+        We initialize the router with a dictionary of API keys.
+        We then map string names (like 'openai') to their concrete class instances.
+        """
+        self.providers: Dict[str, BaseLLMProvider] = {}
+        # If the user passed an OpenAI key, activate the OpenAI provider
+        if "openai" in api_keys:
+            self.providers["openai"] = OpenAIProvider(api_key=api_keys["openai"])
+        #---Register Anthropic
+        if "anthropic" in api_keys:
+            self.providers["anthropic"] = AnthropicProvider(api_key=api_keys["anthropic"])
+        # We will add others here later
+    async def generate(self, prompt: str, config: RouterConfig) -> LLMResponse:
+        """
+        The main entry point. Routes the prompt to the correct provider with retries.
+        """
+        # 1. Check if the requested provider actually exists in our dictionary
+        provider = self.providers.get(config.provider)
+        if not provider:
+            raise ValueError(f"Provider '{config.provider}' is not configured.")
+        last_exception = None
+        # 2. PRIMARY RETRY LOOP
+        for attempt in range(config.max_retries):
+            try:
+                # If this is a retry, log it
+                if attempt > 0:
+                    logger.info(f"[{config.provider}] Retrying... Attempt {attempt + 1} of {config.max_retries}")
+                # 3. The actual API call to whatever provider is currently selected
+                response = await provider.async_generate(prompt, config)
+                return response
+            except Exception as e:
+                # If the API crashes, we catch it here instead of crashing the app
+                logger.warning(f"[{config.provider}] Attempt {attempt + 1} failed with error: {str(e)}")
+                last_exception = e
+                # 4. EXPONENTIAL BACKOFF
+                # Wait 2^attempt seconds (1s, 2s, 4s, 8s...) before trying again
+                wait_time = 2 ** attempt
+                logger.info(f"Waiting {wait_time} seconds before next attempt...")
+                await asyncio.sleep(wait_time)
+        # 2. FAILOVER LOGIC (The Holy Grail)
+        if config.fallback_provider:
+            logger.error(f"🚨 Primary provider '{config.provider}' exhausted all retries. Initiating FAILOVER to '{config.fallback_provider}'...")
+            # Create a new config for the fallback provider
+            fallback_config = RouterConfig(
+                provider=config.fallback_provider,
+                model=config.fallback_model or config.model, # Use specific fallback model if provided
+                temperature=config.temperature,
+                max_retries=config.max_retries
+            )
+            # Recursively call generate with the new config!
+            return await self.generate(prompt, fallback_config)
+        # 5. If we loop through all max_retries and still fail, crash gracefully
+        logger.error(f"All {config.max_retries} attempts failed and no fallback configured.")
+        raise last_exception

src/schemas.py ADDED Viewed

	@@ -0,0 +1,61 @@

+"""
+In LLMs, you are querying a probabilistic text engine.
+If you ask it for an age, it might give you 25, or it might give you "Twenty-five", or it might say "Based on the data, the user is 25 years old."
+If your system expects 25 but gets a whole sentence, your code crashes in production.
+**Schemas act as the bouncers at the door of your application. We use Pydantic to define the exact shape of the data we expect.**
+"""
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional
+# 1. THE CONFIGURATION SCHEMA
+# This dictates how our router behaves. Notice how we set smart defaults.
+class RouterConfig(BaseModel):
+    """Configuration for how the OmniRouter should route the request."""
+    provider: str = Field(
+        default="openai",
+        description="The LLM provider to use (e.g., 'openai', 'anthropic', 'local')"
+    )
+    model: str = Field(
+        default="gpt-4-turbo",
+        description="The specific model string to use"
+    )
+    # Defensive Engineering: An LLM temperature cannot be less than 0 or greater than 2.
+    # ge = greater than or equal to, le = less than or equal to.
+    temperature: float = Field(
+        default=0.7,
+        ge=0.0,
+        le=2.0,
+        description="Creativity score for the model"
+    )
+    max_retries: int = Field(
+        default=3,
+        description="How many times to retry on API failure or rate limit"
+    )
+    # --- NEW CAPABILITY --- Added
+    fallback_provider: Optional[str] = Field(
+        default=None,
+        description="If the primary provider completely fails, switch to this one"
+    )
+    fallback_model: Optional[str] = Field(
+        default=None,
+        description="The model to use for the fallback provider"
+    )
+# 2. THE STANDARDIZED OUTPUT SCHEMA
+# This solves the main pain point from our README.
+# Whether OpenAI or Anthropic answers, the rest of our app gets THIS exact object.
+class LLMResponse(BaseModel):
+    """The standardized output format returned from ANY provider."""
+    content: str = Field(description="The actual text response generated by the LLM")
+    provider_used: str = Field(description="Which provider actually generated this response")
+    model_used: str = Field(description="The specific model used")
+    # We track tokens heavily for cost optimization (Week 14 concept)
+    prompt_tokens: int = Field(default=0, description="Tokens used in the prompt")
+    completion_tokens: int = Field(default=0, description="Tokens used in the completion")
+    # We will calculate this automatically later
+    cost_estimate: float = Field(default=0.0, description="Estimated cost of this call in USD")