Naveen-2007 committed on
Commit
b02630d
·
0 Parent(s):

Perplexity AI Clone - Full Production Version with 8 Modes

Browse files
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
README.md ADDED
File without changes
app/__init__.py ADDED
File without changes
app/api.py ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===================== api.py ==========================
2
+ from typing import List, Dict
3
+ from pathlib import Path
4
+ from fastapi import FastAPI, UploadFile, File, Form
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from fastapi.responses import StreamingResponse
7
+ from pydantic import BaseModel
8
+
9
+ from config.config import Config
10
+ from config.system_prompt import PPLX_SYSTEM_PROMPT
11
+
12
+ # Core routing / graph
13
+ from rag.router import RouterAgent
14
+ from rag.graph_deep import (
15
+ DeepResearchGraph,
16
+ WebSearchGraph,
17
+ RAGOnlyGraph,
18
+ AgenticRAGGraph,
19
+ AnalysisGraph,
20
+ SummarizeGraph
21
+ )
22
+
23
+ # Tools
24
+ from tools.memory_tool import MemoryTool
25
+ from tools.name_tool import NameTool
26
+ from tools.search_tool import SearchTool
27
+ from tools.browse_tool import BrowseTool
28
+ from tools.reranker_tool import Reranker
29
+ from tools.followup_tool import FollowUpGenerator
30
+ from tools.image_tavily import TavilyImageSearch
31
+ from tools.knowledge_panel import KnowledgePanel
32
+ from tools.summarizer_tool import SummarizerTool
33
+
34
+ # RAG pipeline
35
+ from document_processing.processor import DocumentProcessor
36
+ from vectorstore.store import VectorStore
37
+
38
+ # File Manager for per-workspace RAG
39
+ from files.file_manager import FileManager
40
+
41
+
42
# =======================================================
# FastAPI App
# =======================================================
app = FastAPI(title="Perplexity Clone API", version="8.0 - Production LangGraph")

# NOTE(review): the CORS spec forbids credentials with a wildcard origin;
# Starlette will not emit Access-Control-Allow-Credentials for "*" —
# confirm whether credentialed cross-origin requests are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
)

# =======================================================
# Global Components
# =======================================================
# Module-level singletons shared by every endpoint below.
llm = Config.get_llm()
router = RouterAgent()

memory = MemoryTool()
name_tool = NameTool()
followup = FollowUpGenerator()
search_tool = SearchTool()
browse_tool = BrowseTool()
reranker = Reranker()
image_search = TavilyImageSearch()
knowledge_panel = KnowledgePanel()
summarizer = SummarizerTool()

# RAG demo vectorstore
processor = DocumentProcessor(
    chunk_size=Config.CHUNK_SIZE,
    chunk_overlap=Config.CHUNK_OVERLAP,
)
# NOTE(review): this network fetch runs at import time, so app startup
# fails/stalls if the URL is unreachable — consider a startup event hook.
demo_docs = processor.load_url("https://lilianweng.github.io/posts/2023-06-23-agent/")
demo_splits = processor.split(demo_docs)

vector = VectorStore()
vector.create(demo_splits)

# File manager for per-workspace document RAG
file_manager = FileManager(base_dir="workspace_data")

# =======================================================
# Initialize All LangGraph Pipelines
# =======================================================
deep_graph = DeepResearchGraph(vector)
web_graph = WebSearchGraph()
rag_graph = RAGOnlyGraph(file_manager)
agentic_graph = AgenticRAGGraph(file_manager, vector, image_search)
analysis_graph = AnalysisGraph()
# NOTE(review): summarize_graph is constructed but no endpoint in this file
# references it (/api/summarize uses `summarizer` directly) — confirm intent.
summarize_graph = SummarizeGraph()

print("✅ All LangGraph pipelines initialized!")
96
+
97
+
98
+ # =======================================================
99
+ # Models
100
+ # =======================================================
101
class ChatRequest(BaseModel):
    # Raw user message text.
    message: str
    # Conversation/workspace identifier; "default" when the UI omits it.
    workspace_id: str = "default"
104
+
105
+
106
class ChatResponse(BaseModel):
    # Final answer text shown in the Answer tab.
    answer: str
    # Citation entries: {"title": ..., "url": ...}.
    sources: List[Dict[str, str]] = []
    # Web results for the Links tab: {"title", "url", "snippet"}.
    links: List[Dict[str, str]] = []
    # Image results for the Images tab.
    images: List[Dict[str, str]] = []
    # Suggested follow-up questions.
    followups: List[str] = []
    # Tab the UI should open first.
    default_tab: str = "answer"  # "answer" | "links" | "images"
    # Workspace the conversation belongs to.
    workspace_id: str
114
+
115
+
116
+ # =======================================================
117
+ # Utils
118
+ # =======================================================
119
def build_context(ws: str, new_msg: str):
    """Assemble the LLM message list: system prompt, prior workspace turns,
    then the new user message."""
    history = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in memory.get_long_chat(ws)
    ]
    return (
        [{"role": "system", "content": PPLX_SYSTEM_PROMPT}]
        + history
        + [{"role": "user", "content": new_msg}]
    )
126
+
127
+
128
def guess_default_tab(query: str, mode: str) -> str:
    """Choose the first UI tab: "images" for image-ish queries, "links"
    for web mode, otherwise "answer"."""
    lowered = query.lower()
    image_terms = (
        "image", "images", "photo", "photos", "picture", "pictures",
        "wallpaper", "logo", "flag", "screenshot", "pic",
    )
    if any(term in lowered for term in image_terms):
        return "images"
    return "links" if mode == "web" else "answer"
140
+
141
+
142
def tavily_images_safe(query: str) -> List[Dict[str, str]]:
    """Best-effort Tavily image lookup: returns [] instead of raising."""
    try:
        results = image_search.search(query, count=6)
    except Exception as err:
        print("Tavily image search error:", err)
        return []
    return results
149
+
150
+
151
def convert_links(results: List[Dict]) -> List[Dict[str, str]]:
    """Map raw Tavily web results to {title, url, snippet} link dicts,
    dropping entries that have no url. Snippets are capped at 200 chars."""
    return [
        {
            "title": item.get("title", "Result"),
            "url": item["url"],
            "snippet": (item.get("content") or "")[:200],
        }
        for item in results
        if item.get("url")
    ]
166
+
167
+
168
+ # =======================================================
169
+ # Chat Endpoint
170
+ # =======================================================
171
+ @app.post("/api/chat", response_model=ChatResponse)
172
+ def chat(req: ChatRequest):
173
+
174
+ q = req.message.strip()
175
+ ws = req.workspace_id
176
+
177
+ memory.add(ws, "user", q)
178
+
179
+ # -------- Name memory special cases --------
180
+ extracted = name_tool.extract_name(q)
181
+ if extracted:
182
+ memory.set_name(ws, extracted)
183
+ reply = f"Nice to meet you, {extracted}! I’ll remember your name."
184
+ memory.add(ws, "assistant", reply)
185
+ return ChatResponse(answer=reply, workspace_id=ws)
186
+
187
+ if q.lower() in ["tell me my name", "what is my name"]:
188
+ nm = memory.get_name(ws)
189
+ ans = f"Your name is {nm} 😊" if nm else "You haven’t told me your name yet."
190
+ memory.add(ws, "assistant", ans)
191
+ return ChatResponse(answer=ans, workspace_id=ws)
192
+
193
+ # -------- Routing --------
194
+ mode = router.route(q)
195
+ default_tab = guess_default_tab(q, mode)
196
+
197
+ answer = ""
198
+ links: List[Dict[str, str]] = []
199
+ sources: List[Dict[str, str]] = []
200
+ follow: List[str] = []
201
+
202
+ # -------- LLM Mode (chat / creative / small talk) --------
203
+ if mode == "llm":
204
+ msgs = build_context(ws, q)
205
+ answer = llm.invoke(msgs).content
206
+ follow = followup.generate(answer, q)
207
+
208
+ # Optional small set of links
209
+ try:
210
+ res = search_tool.search(q, num_results=3)
211
+ links = convert_links(res)
212
+ except Exception as e:
213
+ print("search error (llm mode):", e)
214
+
215
+ # -------- Image Mode (image search queries) --------
216
+ elif mode == "image":
217
+ # For image queries, provide brief context + focus on images tab
218
+ try:
219
+ res = search_tool.search(q, num_results=3)
220
+ ctx = res[0].get("snippet", "") if res else ""
221
+ answer = f"Here are images related to '{q}'."
222
+ if ctx:
223
+ answer += f"\n\n{ctx}"
224
+ links = convert_links(res)
225
+ except Exception as e:
226
+ print("search error (image mode):", e)
227
+ answer = f"Showing images for: {q}"
228
+
229
+ follow = []
230
+
231
+ # -------- AGENTIC RAG Mode (files + web + images + knowledge) --------
232
+ elif mode == "rag":
233
+ # PLANNER AGENT: Decide which agents to activate
234
+ q_lower = q.lower()
235
+
236
+ use_file_rag = any(
237
+ w in q_lower for w in [
238
+ "summarize", "according to", "in this pdf", "in the document",
239
+ "based on the file", "read my", "extract from", "uploaded",
240
+ "this file", "the file", "my file", "from file"
241
+ ]
242
+ ) or len(q.split()) > 2 # multi-word questions likely need file RAG
243
+
244
+ use_web = any(
245
+ w in q_lower for w in [
246
+ "today", "latest", "current", "news", "stock", "price",
247
+ "real-time", "weather", "who is", "what is", "where is",
248
+ "when", "how much", "compare"
249
+ ]
250
+ )
251
+
252
+ use_images = any(
253
+ w in q_lower for w in [
254
+ "image", "images", "logo", "flag", "photos", "look like",
255
+ "picture", "show me", "wallpaper", "screenshot"
256
+ ]
257
+ )
258
+
259
+ # FILE AGENT: Retrieve from workspace uploaded docs
260
+ ws_obj = file_manager.get_workspace(ws)
261
+ file_chunks = []
262
+ if use_file_rag and ws_obj.initialized:
263
+ file_chunks = ws_obj.retrieve(q, k=6)
264
+
265
+ # REFERENCE AGENT: Retrieve from base vector store (demo docs)
266
+ base_chunks = vector.retrieve(q, k=4)
267
+ base_chunks = reranker.rerank(q, base_chunks, top_k=3)
268
+
269
+ # WEB AGENT: Fetch live web content
270
+ web_pages = []
271
+ web_results = []
272
+ if use_web:
273
+ try:
274
+ web_results = search_tool.search(q, num_results=4)
275
+ for r in web_results:
276
+ url = r.get("url")
277
+ if not url:
278
+ continue
279
+ text = browse_tool.fetch_clean(url)
280
+ if text:
281
+ web_pages.append({
282
+ "title": r.get("title", ""),
283
+ "url": url,
284
+ "content": text[:1500] # Speed optimization
285
+ })
286
+ except Exception as e:
287
+ print(f"Web agent error: {e}")
288
+
289
+ # IMAGE AGENT: Fetch relevant images
290
+ images_result = tavily_images_safe(q) if use_images else []
291
+
292
+ # BUILD COMBINED CONTEXT
293
+ contexts = []
294
+
295
+ if file_chunks:
296
+ file_ctx = "\n\n".join(d.page_content for d in file_chunks)
297
+ contexts.append(f"📄 FILE CONTEXT (from uploaded documents):\n{file_ctx}")
298
+
299
+ if base_chunks:
300
+ ref_ctx = "\n\n".join(d.page_content for d in base_chunks)
301
+ contexts.append(f"📚 REFERENCE CONTEXT:\n{ref_ctx}")
302
+
303
+ if web_pages:
304
+ web_ctx = "\n\n".join(f"[{p['title']}]: {p['content']}" for p in web_pages)
305
+ contexts.append(f"🌐 WEB CONTEXT (live web data):\n{web_ctx}")
306
+
307
+ full_context = "\n\n-----\n\n".join(contexts) if contexts else "No context available."
308
+
309
+ # SYNTHESIZER AGENT: Generate final answer
310
+ synth_prompt = f"""You are an AGENTIC RAG synthesis model like Perplexity AI.
311
+ Combine information from FILE CONTEXT, REFERENCE CONTEXT and WEB CONTEXT.
312
+
313
+ RULES:
314
+ 1. PRIORITIZE info from FILE CONTEXT (user's uploaded documents) when available.
315
+ 2. Use WEB CONTEXT to add current/live information.
316
+ 3. Use REFERENCE CONTEXT for background knowledge.
317
+ 4. Cite sources using [1], [2], etc. when referencing specific info.
318
+ 5. If answering from a file, say "According to your uploaded document..."
319
+ 6. Do NOT hallucinate - only use info from the provided contexts.
320
+ 7. Be concise but comprehensive.
321
+
322
+ AVAILABLE CONTEXT:
323
+ {full_context}
324
+
325
+ USER QUESTION: {q}
326
+
327
+ FINAL ANSWER:"""
328
+
329
+ msgs = build_context(ws, synth_prompt)
330
+ answer = llm.invoke(msgs).content
331
+ follow = followup.generate(answer, q)
332
+
333
+ # BUILD SOURCES
334
+ sources = []
335
+ if file_chunks:
336
+ for d in file_chunks:
337
+ sources.append({
338
+ "title": d.metadata.get("source", "📄 Uploaded File"),
339
+ "url": d.metadata.get("file_path", "")
340
+ })
341
+
342
+ if web_pages:
343
+ for p in web_pages:
344
+ sources.append({"title": p["title"], "url": p["url"]})
345
+
346
+ # BUILD LINKS
347
+ links = convert_links(web_results)
348
+
349
+ # Set images from image agent
350
+ if images_result:
351
+ # Will be set at the end with tavily_images_safe
352
+ pass
353
+
354
+ # -------- Web Mode (real-time / entities / news) --------
355
+ elif mode == "web":
356
+ res = search_tool.search(q, num_results=5)
357
+
358
+ pages = []
359
+ for r in res:
360
+ url = r.get("url")
361
+ if not url:
362
+ continue
363
+ text = browse_tool.fetch_clean(url)
364
+ if not text:
365
+ continue
366
+ pages.append(
367
+ {
368
+ "title": r.get("title", "Webpage"),
369
+ "url": url,
370
+ "content": text[:2000],
371
+ }
372
+ )
373
+
374
+ ctx = "\n\n".join(p["content"] for p in pages)
375
+ prompt = (
376
+ "Use ONLY the following web content to answer. "
377
+ "Cite sources using [1], [2], etc.\n\n"
378
+ f"{ctx}\n\nQuestion: {q}"
379
+ )
380
+
381
+ msgs = build_context(ws, prompt)
382
+ answer = llm.invoke(msgs).content
383
+ follow = followup.generate(answer, q)
384
+
385
+ links = [
386
+ {
387
+ "title": p["title"],
388
+ "url": p["url"],
389
+ "snippet": p["content"][:200],
390
+ }
391
+ for p in pages
392
+ ]
393
+
394
+ sources = [{"title": p["title"], "url": p["url"]} for p in pages]
395
+
396
+ # -------- Fallback → LLM --------
397
+ else:
398
+ msgs = build_context(ws, q)
399
+ answer = llm.invoke(msgs).content
400
+ follow = followup.generate(answer, q)
401
+
402
+ # -------- Images (for Images tab) --------
403
+ images = tavily_images_safe(q)
404
+
405
+ # Debug logging
406
+ print(f"\n=== API Response Debug ===")
407
+ print(f"Mode: {mode}")
408
+ print(f"Links count: {len(links)}")
409
+ print(f"Images count: {len(images)}")
410
+ print(f"Sources count: {len(sources)}")
411
+ if links:
412
+ print(f"First link: {links[0]}")
413
+ if images:
414
+ print(f"First image: {images[0]}")
415
+ print(f"========================\n")
416
+
417
+ memory.add(ws, "assistant", answer)
418
+
419
+ return ChatResponse(
420
+ answer=answer,
421
+ sources=sources,
422
+ links=links,
423
+ images=images,
424
+ followups=follow,
425
+ default_tab=default_tab,
426
+ workspace_id=ws,
427
+ )
428
+
429
+
430
+ # =======================================================
431
+ # Streaming Endpoint
432
+ # =======================================================
433
+ @app.post("/api/chat/stream")
434
+ def chat_stream(req: ChatRequest):
435
+
436
+ q = req.message
437
+ ws = req.workspace_id
438
+ memory.add(ws, "user", q)
439
+
440
+ msgs = build_context(ws, q)
441
+
442
+ def generate():
443
+ full = ""
444
+ for chunk in llm.stream(msgs):
445
+ tok = getattr(chunk, "content", "")
446
+ if tok:
447
+ full += tok
448
+ yield tok
449
+ memory.add(ws, "assistant", full)
450
+
451
+ return StreamingResponse(generate(), media_type="text/plain")
452
+
453
+
454
+ # =======================================================
455
+ # Deep Research Endpoint
456
+ # =======================================================
457
+ @app.post("/api/deep_research", response_model=ChatResponse)
458
+ def deep_research(req: ChatRequest):
459
+
460
+ q = req.message
461
+ ws = req.workspace_id
462
+
463
+ memory.add(ws, "user", q)
464
+
465
+ try:
466
+ state = deep_graph.run(q)
467
+ # state is a dict (TypedDict), not an object
468
+ answer = state.get("final_answer", "No answer generated.")
469
+ sources = state.get("sources", [])
470
+ except Exception as e:
471
+ print("Deep research error:", e)
472
+ answer = "Something went wrong in deep research mode."
473
+ sources = []
474
+
475
+ memory.add(ws, "assistant", answer)
476
+ images = tavily_images_safe(q)
477
+ follow = followup.generate(answer, q)
478
+
479
+ return ChatResponse(
480
+ answer=answer,
481
+ sources=sources,
482
+ links=[],
483
+ images=images,
484
+ followups=follow,
485
+ default_tab="answer",
486
+ workspace_id=ws,
487
+ )
488
+
489
+
490
+ # =======================================================
491
+ # Knowledge Panel Endpoint
492
+ # =======================================================
493
+ @app.get("/api/knowledge_panel")
494
+ def get_knowledge_panel(q: str):
495
+ """
496
+ Returns Wikipedia-style infobox + AI-generated facts.
497
+ Used by UI to render a sidebar knowledge card.
498
+ """
499
+ try:
500
+ panel = knowledge_panel.build_panel(q)
501
+ return panel
502
+ except Exception as e:
503
+ print("Knowledge panel error:", e)
504
+ return {"wiki": {}, "facts": []}
505
+
506
+
507
+ # =======================================================
508
+ # FILE UPLOAD (PDF / TXT / PPTX) - Perplexity Spaces Feature
509
+ # =======================================================
510
+ @app.post("/api/upload_docs")
511
+ async def upload_docs(
512
+ workspace_id: str = Form("default"),
513
+ files: List[UploadFile] = File(...)
514
+ ):
515
+ """
516
+ Upload one or more documents and index them for this workspace.
517
+ Supports PDF, TXT, MD, PPT, PPTX files.
518
+ """
519
+ ws = file_manager.get_workspace(workspace_id)
520
+ saved_paths = []
521
+
522
+ for f in files:
523
+ ext = Path(f.filename).suffix.lower()
524
+ if ext not in [".pdf", ".txt", ".md", ".ppt", ".pptx"]:
525
+ continue # skip unsupported types
526
+
527
+ dest = Path(ws.base_dir) / f.filename
528
+ with open(dest, "wb") as out:
529
+ content = await f.read()
530
+ out.write(content)
531
+ saved_paths.append(dest)
532
+
533
+ if saved_paths:
534
+ ws.add_files(saved_paths)
535
+ print(f"✅ Indexed {len(saved_paths)} files for workspace '{workspace_id}'")
536
+
537
+ return {
538
+ "workspace_id": workspace_id,
539
+ "files": ws.files,
540
+ "count": len(ws.files),
541
+ "message": f"Successfully indexed {len(saved_paths)} files"
542
+ }
543
+
544
+
545
+ @app.get("/api/workspace_files/{workspace_id}")
546
+ def get_workspace_files(workspace_id: str):
547
+ """Get list of files uploaded to a workspace."""
548
+ ws = file_manager.get_workspace(workspace_id)
549
+ return {
550
+ "workspace_id": workspace_id,
551
+ "files": ws.files,
552
+ "initialized": ws.initialized
553
+ }
554
+
555
+
556
+ @app.delete("/api/workspace/{workspace_id}")
557
+ def clear_workspace(workspace_id: str):
558
+ """Clear all files from a workspace."""
559
+ file_manager.clear_workspace(workspace_id)
560
+ return {"message": f"Workspace '{workspace_id}' cleared"}
561
+
562
+
563
+ # =======================================================
564
+ # MODE-SPECIFIC ENDPOINTS
565
+ # =======================================================
566
+
567
class ModeRequest(BaseModel):
    # Raw user message text.
    message: str
    # Conversation/workspace identifier.
    workspace_id: str = "default"
    # Requested mode; "auto" lets the server decide.
    mode: str = "auto"
571
+
572
+
573
+ @app.post("/api/focus", response_model=ChatResponse)
574
+ def focus_mode(req: ModeRequest):
575
+ """Focus mode - concise, direct answers without web search."""
576
+ q = req.message.strip()
577
+ ws = req.workspace_id
578
+
579
+ memory.add(ws, "user", q)
580
+
581
+ prompt = f"""You are in FOCUS mode. Provide a concise, direct answer.
582
+ - No unnecessary elaboration
583
+ - Get straight to the point
584
+ - Use bullet points if helpful
585
+ - Be accurate and helpful
586
+
587
+ Question: {q}
588
+
589
+ Answer:"""
590
+
591
+ msgs = build_context(ws, prompt)
592
+ answer = llm.invoke(msgs).content
593
+ follow = followup.generate(answer, q)
594
+
595
+ memory.add(ws, "assistant", answer)
596
+
597
+ return ChatResponse(
598
+ answer=answer,
599
+ sources=[],
600
+ links=[],
601
+ images=[],
602
+ followups=follow,
603
+ default_tab="answer",
604
+ workspace_id=ws
605
+ )
606
+
607
+
608
+ @app.post("/api/writing", response_model=ChatResponse)
609
+ def writing_mode(req: ModeRequest):
610
+ """Writing mode - creative writing, essays, content generation."""
611
+ q = req.message.strip()
612
+ ws = req.workspace_id
613
+
614
+ memory.add(ws, "user", q)
615
+
616
+ prompt = f"""You are in WRITING mode - a creative writing assistant.
617
+ Help with:
618
+ - Essays, articles, blog posts
619
+ - Creative writing, stories
620
+ - Professional emails and documents
621
+ - Content improvement and editing
622
+ - Grammar and style suggestions
623
+
624
+ Be creative, engaging, and helpful. Format your response well.
625
+
626
+ Request: {q}
627
+
628
+ Response:"""
629
+
630
+ msgs = build_context(ws, prompt)
631
+ answer = llm.invoke(msgs).content
632
+ follow = followup.generate(answer, q)
633
+
634
+ memory.add(ws, "assistant", answer)
635
+
636
+ return ChatResponse(
637
+ answer=answer,
638
+ sources=[],
639
+ links=[],
640
+ images=[],
641
+ followups=follow,
642
+ default_tab="answer",
643
+ workspace_id=ws
644
+ )
645
+
646
+
647
+ @app.post("/api/math", response_model=ChatResponse)
648
+ def math_mode(req: ModeRequest):
649
+ """Math mode - mathematical calculations and explanations."""
650
+ q = req.message.strip()
651
+ ws = req.workspace_id
652
+
653
+ memory.add(ws, "user", q)
654
+
655
+ prompt = f"""You are in MATH mode - a mathematical assistant.
656
+ - Solve mathematical problems step by step
657
+ - Show all work and calculations
658
+ - Explain the reasoning
659
+ - Use proper mathematical notation
660
+ - Handle algebra, calculus, statistics, geometry, etc.
661
+
662
+ Problem: {q}
663
+
664
+ Solution:"""
665
+
666
+ msgs = build_context(ws, prompt)
667
+ answer = llm.invoke(msgs).content
668
+ follow = followup.generate(answer, q)
669
+
670
+ memory.add(ws, "assistant", answer)
671
+
672
+ return ChatResponse(
673
+ answer=answer,
674
+ sources=[],
675
+ links=[],
676
+ images=[],
677
+ followups=follow,
678
+ default_tab="answer",
679
+ workspace_id=ws
680
+ )
681
+
682
+
683
+ @app.post("/api/code", response_model=ChatResponse)
684
+ def code_mode(req: ModeRequest):
685
+ """Code mode - programming help and code generation."""
686
+ q = req.message.strip()
687
+ ws = req.workspace_id
688
+
689
+ memory.add(ws, "user", q)
690
+
691
+ prompt = f"""You are in CODE mode - an expert programming assistant.
692
+ - Write clean, efficient, well-commented code
693
+ - Explain the code logic
694
+ - Follow best practices
695
+ - Handle any programming language
696
+ - Debug and fix code issues
697
+ - Suggest improvements
698
+
699
+ Request: {q}
700
+
701
+ Response:"""
702
+
703
+ msgs = build_context(ws, prompt)
704
+ answer = llm.invoke(msgs).content
705
+ follow = followup.generate(answer, q)
706
+
707
+ memory.add(ws, "assistant", answer)
708
+
709
+ return ChatResponse(
710
+ answer=answer,
711
+ sources=[],
712
+ links=[],
713
+ images=[],
714
+ followups=follow,
715
+ default_tab="answer",
716
+ workspace_id=ws
717
+ )
718
+
719
+
720
+ @app.post("/api/analyze", response_model=ChatResponse)
721
+ def analyze_mode(req: ModeRequest):
722
+ """
723
+ Analysis mode - deep analysis with web research.
724
+ Production-level LangGraph implementation.
725
+ """
726
+ q = req.message.strip()
727
+ ws = req.workspace_id
728
+
729
+ memory.add(ws, "user", q)
730
+
731
+ # Run the AnalysisGraph pipeline
732
+ state = analysis_graph.run(q)
733
+
734
+ answer = state.get("answer", "No analysis generated.")
735
+ sources = state.get("sources", [])
736
+ links = state.get("links", [])
737
+ follow = state.get("followups", [])
738
+
739
+ # Get related images
740
+ images = tavily_images_safe(q)
741
+
742
+ memory.add(ws, "assistant", answer)
743
+
744
+ return ChatResponse(
745
+ answer=answer,
746
+ sources=sources,
747
+ links=links,
748
+ images=images,
749
+ followups=follow,
750
+ default_tab="answer",
751
+ workspace_id=ws
752
+ )
753
+
754
+
755
+ @app.post("/api/summarize", response_model=ChatResponse)
756
+ def summarize_mode(req: ModeRequest):
757
+ """
758
+ Summarize mode - summarize uploaded documents OR web content.
759
+ Prioritizes uploaded files, then falls back to web search.
760
+ """
761
+ q = req.message.strip()
762
+ ws = req.workspace_id
763
+
764
+ memory.add(ws, "user", q)
765
+
766
+ # STEP 1: Check for uploaded files first
767
+ ws_obj = file_manager.get_workspace(ws)
768
+
769
+ if ws_obj.initialized and ws_obj.files:
770
+ # Summarize from uploaded files
771
+ print(f"📝 SUMMARIZE MODE: Using uploaded files")
772
+ try:
773
+ # Retrieve relevant chunks from files
774
+ chunks = ws_obj.retrieve(q, k=10)
775
+ if chunks:
776
+ # Combine chunk content for summarization
777
+ content = "\n\n".join([c.page_content for c in chunks])
778
+
779
+ # Generate summary
780
+ summary = summarizer.summarize(content, max_words=400)
781
+
782
+ # Build sources from files
783
+ seen_files = set()
784
+ sources = []
785
+ for c in chunks:
786
+ fname = c.metadata.get("source", "Document")
787
+ if fname not in seen_files:
788
+ sources.append({"title": f"📄 {fname}", "url": ""})
789
+ seen_files.add(fname)
790
+
791
+ follow = followup.generate(summary, q)
792
+
793
+ memory.add(ws, "assistant", summary)
794
+
795
+ return ChatResponse(
796
+ answer=summary,
797
+ sources=sources,
798
+ links=[],
799
+ images=[],
800
+ followups=follow,
801
+ default_tab="answer",
802
+ workspace_id=ws
803
+ )
804
+ except Exception as e:
805
+ print(f" ❌ File summarize error: {e}")
806
+
807
+ # STEP 2: Check if it's a URL
808
+ if q.startswith("http"):
809
+ print(f"📝 SUMMARIZE MODE: URL detected")
810
+ try:
811
+ content = browse_tool.fetch_clean(q)
812
+ if content:
813
+ summary = summarizer.summarize(content, max_words=400)
814
+ sources = [{"title": "Source URL", "url": q}]
815
+ links = [{"title": "Source", "url": q, "snippet": content[:200]}]
816
+ follow = followup.generate(summary, q)
817
+
818
+ memory.add(ws, "assistant", summary)
819
+
820
+ return ChatResponse(
821
+ answer=summary,
822
+ sources=sources,
823
+ links=links,
824
+ images=[],
825
+ followups=follow,
826
+ default_tab="answer",
827
+ workspace_id=ws
828
+ )
829
+ except Exception as e:
830
+ print(f" ❌ URL fetch error: {e}")
831
+
832
+ # STEP 3: Fall back to web search and summarize
833
+ print(f"📝 SUMMARIZE MODE: Web search fallback")
834
+ try:
835
+ results = search_tool.search(q, num_results=3)
836
+ content_parts = []
837
+ links = []
838
+
839
+ for r in results:
840
+ url = r.get("url", "")
841
+ title = r.get("title", "")
842
+ text = browse_tool.fetch_clean(url)
843
+ if text:
844
+ content_parts.append(text[:1500])
845
+ links.append({"title": title, "url": url, "snippet": text[:150]})
846
+
847
+ if content_parts:
848
+ combined = "\n\n".join(content_parts)
849
+ summary = summarizer.summarize(combined, max_words=400)
850
+ else:
851
+ summary = "Could not find content to summarize."
852
+
853
+ sources = [{"title": l["title"], "url": l["url"]} for l in links]
854
+ follow = followup.generate(summary, q)
855
+
856
+ memory.add(ws, "assistant", summary)
857
+
858
+ return ChatResponse(
859
+ answer=summary,
860
+ sources=sources,
861
+ links=links,
862
+ images=[],
863
+ followups=follow,
864
+ default_tab="answer",
865
+ workspace_id=ws
866
+ )
867
+ except Exception as e:
868
+ print(f" ❌ Summarize error: {e}")
869
+ return ChatResponse(
870
+ answer=f"Error generating summary: {str(e)}",
871
+ sources=[],
872
+ links=[],
873
+ images=[],
874
+ followups=[],
875
+ default_tab="answer",
876
+ workspace_id=ws
877
+ )
878
+
879
+
880
+ # =======================================================
881
+ # PRODUCTION-LEVEL MODE ENDPOINTS
882
+ # =======================================================
883
+
884
+ @app.post("/api/web", response_model=ChatResponse)
885
+ def web_search_mode(req: ModeRequest):
886
+ """
887
+ Web Search Mode - Real-time web search with source citations.
888
+ Production-level LangGraph implementation.
889
+ """
890
+ q = req.message.strip()
891
+ ws = req.workspace_id
892
+
893
+ memory.add(ws, "user", q)
894
+
895
+ # Run the WebSearchGraph pipeline
896
+ state = web_graph.run(q)
897
+
898
+ answer = state.get("answer", "No answer generated.")
899
+ sources = state.get("sources", [])
900
+ links = state.get("links", [])
901
+ follow = state.get("followups", [])
902
+
903
+ # Get images separately
904
+ images = tavily_images_safe(q)
905
+
906
+ memory.add(ws, "assistant", answer)
907
+
908
+ return ChatResponse(
909
+ answer=answer,
910
+ sources=sources,
911
+ links=links,
912
+ images=images,
913
+ followups=follow,
914
+ default_tab="answer",
915
+ workspace_id=ws
916
+ )
917
+
918
+
919
+ @app.post("/api/rag", response_model=ChatResponse)
920
+ def rag_mode(req: ModeRequest):
921
+ """
922
+ RAG Mode - Search uploaded documents only.
923
+ Production-level LangGraph implementation.
924
+ """
925
+ q = req.message.strip()
926
+ ws = req.workspace_id
927
+
928
+ memory.add(ws, "user", q)
929
+
930
+ # Run the RAGOnlyGraph pipeline
931
+ state = rag_graph.run(q, ws)
932
+
933
+ answer = state.get("answer", "No answer generated.")
934
+ sources = state.get("sources", [])
935
+ follow = state.get("followups", [])
936
+
937
+ memory.add(ws, "assistant", answer)
938
+
939
+ return ChatResponse(
940
+ answer=answer,
941
+ sources=sources,
942
+ links=[],
943
+ images=[],
944
+ followups=follow,
945
+ default_tab="answer",
946
+ workspace_id=ws
947
+ )
948
+
949
+
950
+ @app.post("/api/agentic", response_model=ChatResponse)
951
+ def agentic_mode(req: ModeRequest):
952
+ """
953
+ Agentic Mode - Multi-agent RAG with Planner, File, Web, Knowledge, Image, Synthesizer.
954
+ Production-level LangGraph implementation.
955
+ """
956
+ q = req.message.strip()
957
+ ws = req.workspace_id
958
+
959
+ memory.add(ws, "user", q)
960
+ print(f"\n🤖 AGENTIC MODE (LangGraph): {q}")
961
+
962
+ # Run the AgenticRAGGraph pipeline
963
+ state = agentic_graph.run(q, ws)
964
+
965
+ answer = state.get("answer", "No answer generated.")
966
+ sources = state.get("sources", [])
967
+ links = state.get("links", [])
968
+ images = state.get("images", [])
969
+ follow = state.get("followups", [])
970
+
971
+ memory.add(ws, "assistant", answer)
972
+ print(f" ✅ AgenticGraph: Completed with {len(sources)} sources")
973
+
974
+ return ChatResponse(
975
+ answer=answer,
976
+ sources=sources,
977
+ links=links,
978
+ images=images,
979
+ followups=follow,
980
+ default_tab="answer",
981
+ workspace_id=ws
982
+ )
983
+
984
+
config/__init__.py ADDED
File without changes
config/config.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain.chat_models import init_chat_model
4
+
5
+ load_dotenv()
6
+
7
class Config:
    """Central application configuration, loaded from the environment (.env)."""

    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

    # Provider-prefixed model id understood by langchain's init_chat_model.
    LLM_MODEL = "groq:openai/gpt-oss-120b"

    # Defaults for the document splitter.
    CHUNK_SIZE = 400
    CHUNK_OVERLAP = 80

    @classmethod
    def get_llm(cls):
        """Return a chat LLM instance with tool calling disabled.

        Raises:
            RuntimeError: if GROQ_API_KEY is not set in the environment/.env.
        """
        if not cls.GROQ_API_KEY:
            raise RuntimeError("GROQ_API_KEY missing in .env")
        os.environ["GROQ_API_KEY"] = cls.GROQ_API_KEY
        llm = init_chat_model(cls.LLM_MODEL)
        # Disable tool calling by binding an empty tools list.
        try:
            return llm.bind(tools=[])
        except Exception:
            # Was a bare `except:`, which also swallows SystemExit and
            # KeyboardInterrupt. Fall back to the raw LLM if bind fails.
            return llm
29
+
30
+
config/system_prompt.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PPLX_SYSTEM_PROMPT = """
2
+ You are Perplexity AI.
3
+
4
+ When user greets (hi, hello, hey), respond like a friendly assistant:
5
+ Short, conversational, natural.
6
+
7
+ Do NOT give definitions or grammar explanations unless user asks.
8
+ Your tone: concise, helpful, modern.
9
+
10
+ Always adapt style based on question.
11
+ """
document_processing/__init__.py ADDED
File without changes
document_processing/processor.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+ from langchain.schema import Document
5
+
6
+
7
class DocumentProcessor:
    """Loads documents (web, PDF, text) and splits them into RAG chunks."""

    def __init__(self, chunk_size: int = 400, chunk_overlap: int = 80) -> None:
        # Recursive splitter tries to keep semantic units intact per chunk.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def load_url(self, url: str) -> List[Document]:
        """Fetch and parse a web page into documents."""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_pdf(self, file_path: str) -> List[Document]:
        """Parse a PDF file into documents."""
        loader = PyPDFLoader(file_path)
        return loader.load()

    def load_txt(self, file_path: str) -> List[Document]:
        """Read a UTF-8 text file into documents."""
        loader = TextLoader(file_path, encoding="utf-8")
        return loader.load()

    def split(self, docs: List[Document]) -> List[Document]:
        """Split loaded documents into overlapping chunks."""
        return self.splitter.split_documents(docs)
embeddings/__init__.py ADDED
File without changes
embeddings/embedder.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Embedding module using SentenceTransformer (free)."""
2
+
3
+ from typing import List
4
+ from sentence_transformers import SentenceTransformer
5
+ from langchain_core.embeddings import Embeddings
6
+
7
+
8
class Embedder(Embeddings):
    """LangChain-compatible wrapper around a SentenceTransformer model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of documents into dense vectors."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        vectors = self.model.encode([text])
        return vectors[0].tolist()

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Convenience alias for embed_documents."""
        return self.embed_documents(texts)
files/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # files/__init__.py
files/file_manager.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # files/file_manager.py
2
+
3
+ from typing import Dict, List
4
+ from pathlib import Path
5
+ import shutil
6
+
7
+ from document_processing.processor import DocumentProcessor
8
+ from vectorstore.store import VectorStore
9
+ from langchain.schema import Document
10
+
11
+
12
class FileWorkspace:
    """
    Holds vector store + metadata for one workspace's uploaded docs.
    """

    def __init__(self, workspace_id: str, base_dir: str = "workspace_data"):
        self.workspace_id = workspace_id
        self.base_dir = Path(base_dir) / workspace_id
        self.base_dir.mkdir(parents=True, exist_ok=True)

        self.processor = DocumentProcessor()
        self.vector = VectorStore()
        self.initialized = False  # True once the vector index was created
        self.files: List[str] = []  # names of uploaded files

    def add_files(self, uploaded_paths: List[Path]):
        """
        Index newly uploaded files into the workspace vector store.

        Supports .pdf, .txt/.md and (if the unstructured loader is installed)
        .ppt/.pptx. Files that fail to load are skipped with a log line.
        """
        docs: List[Document] = []

        for p in uploaded_paths:
            try:
                loaded: List[Document] = []
                suffix = p.suffix.lower()
                if suffix == ".pdf":
                    loaded = self.processor.load_pdf(str(p))
                elif suffix in (".txt", ".md"):
                    loaded = self.processor.load_txt(str(p))
                elif suffix in (".ppt", ".pptx"):
                    # Use UnstructuredPowerPointLoader if available
                    try:
                        from langchain_community.document_loaders import UnstructuredPowerPointLoader
                        loaded = UnstructuredPowerPointLoader(str(p)).load()
                    except ImportError:
                        print(f"UnstructuredPowerPointLoader not available for {p.name}")
                        continue

                # BUG FIX: attach per-file metadata while `p` still refers to
                # THIS file. The original ran this loop after the file loop
                # finished, so every doc was tagged with the LAST file's
                # path/name and citations pointed at the wrong file.
                for doc in loaded:
                    doc.metadata["file_path"] = str(p)
                    doc.metadata["source"] = p.name

                docs.extend(loaded)
                self.files.append(p.name)
            except Exception as e:
                print(f"Error loading file {p.name}: {e}")
                continue

        if not docs:
            return

        chunks = self.processor.split(docs)

        if not self.initialized:
            # First upload creates the index; later uploads append to it.
            self.vector.create(chunks)
            self.initialized = True
        else:
            self.vector.store.add_documents(chunks)

    def retrieve(self, query: str, k: int = 6):
        """Return top-k chunks for the query, or [] if nothing is indexed."""
        if not self.initialized:
            return []
        return self.vector.retrieve(query, k=k)
74
+
75
+
76
class FileManager:
    """
    Keeps a map: workspace_id -> FileWorkspace
    """

    def __init__(self, base_dir: str = "workspace_data"):
        self.base_dir = base_dir
        self._workspaces: Dict[str, FileWorkspace] = {}

    def get_workspace(self, workspace_id: str) -> FileWorkspace:
        """Return the workspace for this id, creating it on first access."""
        ws = self._workspaces.get(workspace_id)
        if ws is None:
            ws = FileWorkspace(workspace_id, self.base_dir)
            self._workspaces[workspace_id] = ws
        return ws

    def clear_workspace(self, workspace_id: str):
        """Remove a workspace's on-disk data and drop its in-memory entry."""
        ws_dir = Path(self.base_dir) / workspace_id
        if ws_dir.exists():
            shutil.rmtree(ws_dir)
        self._workspaces.pop(workspace_id, None)

    def get_files(self, workspace_id: str) -> List[str]:
        """List indexed file names; [] for an unknown workspace."""
        ws = self._workspaces.get(workspace_id)
        return ws.files if ws is not None else []
main.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
def main() -> None:
    """Entry point: print a greeting to confirm the package runs."""
    greeting = "Hello from ai-clone!"
    print(greeting)


if __name__ == "__main__":
    main()
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "ai-clone"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "beautifulsoup4>=4.14.3",
9
+ "faiss-cpu>=1.13.0",
10
+ "fastapi>=0.123.0",
11
+ "langchain>=0.3.0",
12
+ "langchain-community>=0.3.0",
13
+ "langchain-core>=0.3.0",
14
+ "langchain-groq>=0.2.0",
15
+ "langchain-openai>=0.2.0",
16
+ "langchain-text-splitters>=0.3.0",
17
+ "langgraph>=0.2.0",
18
+ "numpy>=2.3.5",
19
+ "pdfminer-six>=20251107",
20
+ "pinecone-client>=6.0.0",
21
+ "pydantic>=2.12.5",
22
+ "pypdf>=6.4.0",
23
+ "pypdf2>=3.0.1",
24
+ "python-dotenv>=1.2.1",
25
+ "requests>=2.32.5",
26
+ "sentence-transformers>=5.1.2",
27
+ "tavily-python>=0.7.13",
28
+ "tqdm>=4.67.1",
29
+ "trafilatura>=2.0.0",
30
+ "uvicorn[standard]>=0.38.0",
31
+ "wikipedia>=1.4.0",
32
+ ]
rag/__init__.py ADDED
File without changes
rag/agents.py ADDED
@@ -0,0 +1,803 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production-Level Agents for Perplexity Clone
3
+ =============================================
4
+ Each agent is a node in LangGraph pipelines.
5
+ Agents handle specific tasks and pass state to next nodes.
6
+ """
7
+
8
+ from typing import List, Dict, Any
9
+ from rag.rag_state import (
10
+ RAGState,
11
+ WebSearchState,
12
+ RAGOnlyState,
13
+ AgenticState,
14
+ AnalysisState,
15
+ SummarizeState
16
+ )
17
+ from config.config import Config
18
+ from config.system_prompt import PPLX_SYSTEM_PROMPT
19
+ from vectorstore.store import VectorStore
20
+ from tools.search_tool import SearchTool
21
+ from tools.browse_tool import BrowseTool
22
+ from tools.reranker_tool import Reranker
23
+ from tools.citation_tool import CitationTool
24
+ from tools.summarizer_tool import SummarizerTool
25
+ from tools.followup_tool import FollowUpGenerator
26
+
27
+
28
+ # =============================================================================
29
+ # DEEP RESEARCH AGENTS (Original)
30
+ # =============================================================================
31
+
32
class PlannerAgent:
    """Decomposes a user question into 3-5 sub-questions."""

    def __init__(self) -> None:
        self.llm = Config.get_llm()

    def plan(self, state: RAGState) -> RAGState:
        """Ask the LLM for a numbered decomposition of state["question"].

        Populates state["sub_questions"] with at most 5 non-empty items.
        """
        prompt = (
            "Break the following question into 3-5 clear sub-questions.\n"
            "Return them as a numbered list.\n\n"
            f"{state['question']}"
        )
        resp = self.llm.invoke([
            {"role": "system", "content": PPLX_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ])
        # Strip bullet/number decoration, then drop lines that end up empty.
        # BUG FIX: a bare "-" or "•" bullet used to strip to "" and crash
        # the l[0] check below with IndexError.
        lines = [l.strip("-• ").strip() for l in resp.content.splitlines() if l.strip()]
        lines = [l for l in lines if l]
        subqs: List[str] = []
        for l in lines:
            # Keep explicitly numbered items; if the model returned a short
            # un-numbered list, keep everything.
            if l[0].isdigit() or len(lines) <= 5:
                subqs.append(l)
        state["sub_questions"] = subqs[:5]
        return state
55
+
56
+
57
class ResearchAgent:
    """Collects evidence from local RAG + web search."""

    def __init__(self, vector_store: VectorStore) -> None:
        self.vs = vector_store
        self.search_tool = SearchTool()
        self.browse_tool = BrowseTool()
        self.reranker = Reranker()

    def research(self, state: RAGState) -> RAGState:
        # For each planner sub-question, gather evidence from two channels:
        # reranked local vector-store chunks, then fetched web-page text.
        # Fills state["web_pages"] (full pages) and state["evidence"] (text).
        evidence: List[str] = []
        pages_all: List[Dict] = []

        for sq in state.get("sub_questions", []):
            # Local RAG: over-fetch 8 chunks, keep the 4 best after reranking.
            docs = self.vs.retrieve(sq, k=8)
            docs = self.reranker.rerank(sq, docs, top_k=4)
            evidence.extend(d.page_content for d in docs)

            # Web search + browse: cleaned page text per hit; skip hits
            # without a URL or whose fetch returned nothing.
            results = self.search_tool.search(sq, num_results=3)
            for r in results:
                url = r.get("url")
                title = r.get("title", "Web result")
                if not url:
                    continue
                content = self.browse_tool.fetch_clean(url)
                if not content:
                    continue
                pages_all.append({"title": title, "url": url, "content": content})
                # Cap page text so the downstream prompt stays bounded.
                evidence.append(content[:1500])

        state["web_pages"] = pages_all
        state["evidence"] = evidence
        return state
92
+
93
+
94
class AggregatorAgent:
    """Writes draft answers per sub-question from evidence."""

    def __init__(self) -> None:
        self.llm = Config.get_llm()

    def aggregate(self, state: RAGState) -> RAGState:
        """Answer each sub-question against the shared evidence pool.

        Populates state["draft_answers"], one draft per sub-question.
        """
        # PERF: the evidence context is identical for every sub-question, so
        # build it once instead of re-joining inside the loop (the original
        # recomputed the same join per iteration).
        context = "\n\n".join(state.get("evidence", [])[:12])

        drafts: List[str] = []
        for sq in state.get("sub_questions", []):
            prompt = (
                "Using the evidence below, answer the sub-question briefly and clearly.\n\n"
                f"Evidence:\n{context}\n\nSub-question: {sq}"
            )
            resp = self.llm.invoke([
                {"role": "system", "content": PPLX_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ])
            drafts.append(f"Sub-question: {sq}\n{resp.content}")

        state["draft_answers"] = drafts
        return state
116
+
117
+
118
class WriterAgent:
    """Writes final structured deep-research report."""

    def __init__(self) -> None:
        self.llm = Config.get_llm()

    def write(self, state: RAGState) -> RAGState:
        """Compose the final report from the per-sub-question drafts."""
        findings = "\n\n".join(state.get("draft_answers", []))
        prompt = (
            "You are Perplexity in deep research mode.\n"
            "Write a structured answer with sections: Overview, Key Points, Details, Conclusion.\n"
            "Use inline citations like [1], [2] where appropriate.\n\n"
            f"Original question:\n{state['question']}\n\nFindings:\n{findings}"
        )
        messages = [
            {"role": "system", "content": PPLX_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        response = self.llm.invoke(messages)
        state["final_answer"] = response.content
        return state
138
+
139
+
140
class ValidatorAgent:
    """Builds source list & (optionally) validates citations."""

    def __init__(self) -> None:
        self.citation_tool = CitationTool()

    def validate_and_attach(self, state: RAGState) -> RAGState:
        """Attach the list of web sources used by the final answer."""
        # Candidate sources: at most the first 10 fetched pages.
        candidates: List[Dict] = [
            {"title": page["title"], "url": page["url"]}
            for page in state.get("web_pages", [])[:10]
        ]
        answer_text = state.get("final_answer", "")
        state["sources"] = self.citation_tool.attach_sources(answer_text, candidates)
        return state
155
+
156
+
157
+ # =============================================================================
158
+ # WEB SEARCH AGENTS
159
+ # =============================================================================
160
+
161
class WebSearchNode:
    """Node 1: Execute web search query."""

    def __init__(self):
        self.search_tool = SearchTool()

    def search(self, state: WebSearchState) -> WebSearchState:
        """Run the search engine; stash raw hits in state["search_results"]."""
        query = state.get("query", "")
        print(f" 🔍 WebSearchNode: Searching for '{query[:50]}...'")

        try:
            hits = self.search_tool.search(query, num_results=6)
        except Exception as e:
            # A failed search degrades to an empty result list.
            print(f" ❌ WebSearchNode error: {e}")
            hits = []
        state["search_results"] = hits
        return state
179
+
180
+
181
class WebFetchNode:
    """Node 2: Fetch and parse web pages."""

    def __init__(self):
        self.browse_tool = BrowseTool()

    def fetch(self, state: WebSearchState) -> WebSearchState:
        """Download cleaned text for each search hit.

        Populates state["web_pages"] (trimmed content for the prompt) and
        state["links"] (title/url/snippet for the UI).
        """
        pages = []
        links = []

        for r in state.get("search_results", []):
            url = r.get("url")
            if not url:
                continue

            try:
                content = self.browse_tool.fetch_clean(url)
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallows
                # KeyboardInterrupt/SystemExit. Skip only real fetch errors.
                continue
            if not content:
                continue

            pages.append({
                "title": r.get("title", ""),
                "url": url,
                "content": content[:2500]
            })
            links.append({
                "title": r.get("title", ""),
                "url": url,
                "snippet": content[:200]
            })

        print(f" 📄 WebFetchNode: Fetched {len(pages)} pages")
        state["web_pages"] = pages
        state["links"] = links
        return state
216
+
217
+
218
class WebContextNode:
    """Node 3: Build context from fetched pages."""

    def build_context(self, state: WebSearchState) -> WebSearchState:
        """Join fetched pages into one numbered context string ([1], [2], ...)."""
        pages = state.get("web_pages", [])
        sections = [
            f"[{idx + 1}] {page['title']}:\n{page['content']}"
            for idx, page in enumerate(pages)
        ]
        # join of an empty list is "" - matches the no-pages case.
        state["context"] = "\n\n---\n\n".join(sections)
        print(f" 📝 WebContextNode: Built context from {len(pages)} sources")
        return state
234
+
235
+
236
class WebAnswerNode:
    """Node 4: Generate answer from context."""

    def __init__(self):
        self.llm = Config.get_llm()
        self.followup = FollowUpGenerator()

    def answer(self, state: WebSearchState) -> WebSearchState:
        """Produce the cited answer, follow-up questions and source list."""
        query = state.get("query", "")
        context = state.get("context", "")

        if not context:
            # No web material - fall back to a plain question.
            prompt = f"Answer this question: {query}"
        else:
            prompt = f"""You are a web search assistant like Perplexity AI.
Use ONLY the following web sources to answer. Cite sources using [1], [2], etc.

WEB SOURCES:
{context}

QUESTION: {query}

Provide a comprehensive, well-cited answer:"""

        response = self.llm.invoke([
            {"role": "system", "content": PPLX_SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ])

        generated = response.content
        state["answer"] = generated
        state["followups"] = self.followup.generate(generated, query)

        # One source entry per fetched page, in fetch order.
        state["sources"] = [
            {"title": page["title"], "url": page["url"]}
            for page in state.get("web_pages", [])
        ]

        print(f" ✅ WebAnswerNode: Generated answer")
        return state
275
+
276
+
277
+ # =============================================================================
278
+ # RAG-ONLY AGENTS
279
+ # =============================================================================
280
+
281
class RAGRetrieveNode:
    """Node 1: Retrieve from uploaded documents."""

    def __init__(self, file_manager):
        self.file_manager = file_manager
        self.reranker = Reranker()

    def retrieve(self, state: RAGOnlyState) -> RAGOnlyState:
        """Pull top chunks from the workspace index into state["file_chunks"]."""
        query = state.get("query", "")
        ws = self.file_manager.get_workspace(state.get("workspace_id", "default"))

        if not (ws.initialized and ws.files):
            print(f" 📁 RAGRetrieveNode: No files in workspace")
            state["file_chunks"] = []
            return state

        try:
            docs = ws.retrieve(query, k=8)
        except Exception as e:
            print(f" ❌ RAGRetrieveNode error: {e}")
            state["file_chunks"] = []
            return state

        # Plain dicts keep the graph state simple and serializable.
        state["file_chunks"] = [
            {"content": d.page_content, "source": d.metadata.get("source", "Document")}
            for d in docs
        ]
        print(f" 📁 RAGRetrieveNode: Retrieved {len(docs)} chunks")
        return state
312
+
313
+
314
class RAGContextNode:
    """Node 2: Build context from retrieved chunks."""

    def build_context(self, state: RAGOnlyState) -> RAGOnlyState:
        """Format retrieved chunks as numbered [DOC n] sections."""
        chunks = state.get("file_chunks", [])
        sections = [
            f"[DOC {num + 1}] {chunk['source']}:\n{chunk['content']}"
            for num, chunk in enumerate(chunks)
        ]
        state["context"] = "\n\n---\n\n".join(sections)
        print(f" 📝 RAGContextNode: Built context from {len(chunks)} chunks")
        return state
330
+
331
+
332
class RAGAnswerNode:
    """Node 3: Generate answer from document context."""

    def __init__(self):
        self.llm = Config.get_llm()
        self.followup = FollowUpGenerator()

    def answer(self, state: RAGOnlyState) -> RAGOnlyState:
        """Answer strictly from the document context; empty context short-circuits."""
        query = state.get("query", "")
        context = state.get("context", "")
        chunks = state.get("file_chunks", [])

        if not context:
            # Nothing indexed - tell the user how to upload instead of guessing.
            state["answer"] = "📚 No documents found. Please upload files first using the 📎 button."
            state["sources"] = []
            state["followups"] = []
            return state

        prompt = f"""You are a document analysis assistant.
Answer ONLY based on the provided documents. Do NOT use external knowledge.

DOCUMENTS:
{context}

QUESTION: {query}

Instructions:
- Answer based ONLY on document content
- Say "According to your documents..." when citing
- Quote relevant parts when helpful
- If info is not in documents, say so

ANSWER:"""

        response = self.llm.invoke([
            {"role": "system", "content": PPLX_SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ])

        generated = response.content
        state["answer"] = generated
        state["followups"] = self.followup.generate(generated, query)

        # One source entry per distinct file, in first-seen order
        # (dict.fromkeys preserves insertion order while deduplicating).
        distinct = dict.fromkeys(c.get("source", "Document") for c in chunks)
        sources = [{"title": f"📄 {name}", "url": ""} for name in distinct]
        state["sources"] = sources

        print(f" ✅ RAGAnswerNode: Generated answer from {len(sources)} sources")
        return state
387
+
388
+
389
+ # =============================================================================
390
+ # AGENTIC RAG AGENTS (Multi-Agent Pipeline)
391
+ # =============================================================================
392
+
393
class AgenticPlannerNode:
    """Node 1: Planner agent decides which sub-agents to activate."""

    # Keyword triggers per sub-agent, matched as substrings of the query.
    _FILE_HINTS = (
        "document", "file", "pdf", "uploaded", "summarize my",
        "according to", "in the file", "extract", "my notes",
    )
    _WEB_HINTS = (
        "today", "current", "latest", "news", "weather", "stock",
        "who is", "what is", "where", "when", "price", "live",
        "recent", "update",
    )
    _IMAGE_HINTS = (
        "image", "photo", "picture", "logo", "show me", "look like",
        "flag", "screenshot",
    )
    _KNOWLEDGE_HINTS = (
        "explain", "define", "concept", "theory", "how does",
        "what is", "meaning of",
    )

    def plan(self, state: AgenticState) -> AgenticState:
        """Set use_file / use_web / use_images / use_knowledge from the query."""
        query = state.get("query", "").lower()

        def triggered(hints):
            return any(h in query for h in hints)

        state["use_file"] = triggered(self._FILE_HINTS)
        # Very short queries default to a web lookup as well.
        state["use_web"] = triggered(self._WEB_HINTS) or len(query.split()) <= 4
        state["use_images"] = triggered(self._IMAGE_HINTS)
        state["use_knowledge"] = triggered(self._KNOWLEDGE_HINTS)

        print(f" 📋 AgenticPlannerNode: file={state['use_file']}, web={state['use_web']}, images={state['use_images']}")
        return state
423
+
424
+
425
class AgenticFileNode:
    """Node 2: File agent retrieves from uploaded documents."""

    def __init__(self, file_manager):
        self.file_manager = file_manager

    def retrieve(self, state: AgenticState) -> AgenticState:
        """Fill file_context / file_sources from the workspace index (if enabled)."""

        def cleared(s):
            # Uniform empty result for every skip / failure path.
            s["file_context"] = ""
            s["file_sources"] = []
            return s

        if not state.get("use_file", False):
            return cleared(state)

        query = state.get("query", "")
        ws = self.file_manager.get_workspace(state.get("workspace_id", "default"))
        if not ws.initialized:
            return cleared(state)

        try:
            chunks = ws.retrieve(query, k=6)
        except Exception as e:
            print(f" ❌ AgenticFileNode error: {e}")
            return cleared(state)

        if not chunks:
            return cleared(state)

        state["file_context"] = "\n\n".join(c.page_content for c in chunks)
        state["file_sources"] = [
            {"title": f"📄 {c.metadata.get('source', 'Document')}", "url": ""}
            for c in chunks
        ]
        print(f" 📁 AgenticFileNode: Found {len(chunks)} chunks")
        return state
464
+
465
+
466
class AgenticWebNode:
    """Node 3: Web agent fetches real-time information."""

    def __init__(self):
        self.search_tool = SearchTool()
        self.browse_tool = BrowseTool()

    def search(self, state: AgenticState) -> AgenticState:
        """Search + browse the web, filling web_context / web_sources / links."""
        if not state.get("use_web", False):
            state["web_context"] = ""
            state["web_sources"] = []
            state["links"] = []
            return state

        query = state.get("query", "")

        try:
            snippets = []
            sources = []
            links = []
            for hit in self.search_tool.search(query, num_results=4):
                url = hit.get("url")
                title = hit.get("title", "")
                if not url:
                    continue
                text = self.browse_tool.fetch_clean(url)
                if not text:
                    continue
                snippets.append(f"[{title}]: {text[:1500]}")
                sources.append({"title": title, "url": url})
                links.append({"title": title, "url": url, "snippet": text[:150]})

            state["web_context"] = "\n\n".join(snippets)
            state["web_sources"] = sources
            state["links"] = links
            print(f" 🌐 AgenticWebNode: Found {len(sources)} sources")

        except Exception as e:
            # Any search/browse failure degrades to empty web results.
            print(f" ❌ AgenticWebNode error: {e}")
            state["web_context"] = ""
            state["web_sources"] = []
            state["links"] = []

        return state
512
+
513
+
514
class AgenticKnowledgeNode:
    """Node 4: Knowledge agent retrieves from base vector store."""

    def __init__(self, vector_store: VectorStore):
        self.vs = vector_store
        self.reranker = Reranker()

    def retrieve(self, state: AgenticState) -> AgenticState:
        """Fill knowledge_context from the shared vector store (if enabled)."""
        # Default to empty; overwritten only on a successful retrieval.
        state["knowledge_context"] = ""
        if not state.get("use_knowledge", False):
            return state

        query = state.get("query", "")
        try:
            # Over-fetch 4, keep the 3 best after reranking.
            hits = self.reranker.rerank(query, self.vs.retrieve(query, k=4), top_k=3)
            if hits:
                state["knowledge_context"] = "\n\n".join(h.page_content for h in hits)
                print(f" 📚 AgenticKnowledgeNode: Found {len(hits)} chunks")
        except Exception as e:
            print(f" ❌ AgenticKnowledgeNode error: {e}")
            state["knowledge_context"] = ""

        return state
543
+
544
+
545
class AgenticImageNode:
    """Node 5: Image agent fetches relevant images."""

    def __init__(self, image_search):
        self.image_search = image_search

    def search(self, state: AgenticState) -> AgenticState:
        """Populate state["images"] when the planner enabled image search."""
        if not state.get("use_images", False):
            state["images"] = []
            return state

        try:
            found = self.image_search.search(state.get("query", ""), count=6)
            print(f" 🖼️ AgenticImageNode: Found {len(found)} images")
        except Exception as e:
            print(f" ❌ AgenticImageNode error: {e}")
            found = []
        state["images"] = found
        return state
567
+
568
+
569
class AgenticSynthesizerNode:
    """Node 6: Synthesizer agent combines all contexts and generates final answer."""

    def __init__(self):
        self.llm = Config.get_llm()
        self.followup = FollowUpGenerator()

    def synthesize(self, state: AgenticState) -> AgenticState:
        # Merges the file / web / knowledge contexts gathered by earlier
        # nodes into one prompt, asks the LLM for a synthesized answer, and
        # aggregates the source lists. Also stores the merged context under
        # state["combined_context"] for debugging/inspection.
        query = state.get("query", "")

        # Build combined context; each section is truncated to bound the
        # final prompt size.
        contexts = []
        if state.get("file_context"):
            contexts.append(f"📄 FROM YOUR DOCUMENTS:\n{state['file_context'][:2500]}")
        if state.get("web_context"):
            contexts.append(f"🌐 FROM THE WEB:\n{state['web_context'][:2500]}")
        if state.get("knowledge_context"):
            contexts.append(f"📚 KNOWLEDGE BASE:\n{state['knowledge_context'][:1500]}")

        if not contexts:
            # Nothing was gathered: let the model answer from its own weights.
            contexts.append("No specific context found. Using general knowledge.")

        combined = "\n\n---\n\n".join(contexts)
        state["combined_context"] = combined

        prompt = f"""You are an AGENTIC AI assistant that synthesizes information from multiple sources.

AVAILABLE CONTEXT:
{combined}

USER QUESTION: {query}

INSTRUCTIONS:
1. Prioritize user's documents (📄) if relevant
2. Add real-time info from web (🌐) when available
3. Use knowledge base (📚) for background
4. Cite sources appropriately
5. Be comprehensive but concise

SYNTHESIZED ANSWER:"""

        resp = self.llm.invoke([
            {"role": "system", "content": PPLX_SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ])

        answer = resp.content
        state["answer"] = answer
        state["followups"] = self.followup.generate(answer, query)

        # Combine sources: file sources first, then web. NOTE(review): the
        # concatenation is not deduplicated - duplicates may appear.
        all_sources = state.get("file_sources", []) + state.get("web_sources", [])
        state["sources"] = all_sources

        print(f" ✅ AgenticSynthesizerNode: Generated answer with {len(all_sources)} sources")
        return state
625
+
626
+
627
+ # =============================================================================
628
+ # ANALYSIS AGENTS
629
+ # =============================================================================
630
+
631
class AnalysisSearchNode:
    """Node 1: Search for analysis data."""

    def __init__(self):
        self.search_tool = SearchTool()
        self.browse_tool = BrowseTool()

    def search(self, state: AnalysisState) -> AnalysisState:
        """Collect research material for the analysis prompt.

        Fills web_results, web_context, links and sources in state.
        """
        query = state.get("query", "")
        print(f" 🔍 AnalysisSearchNode: Searching for analysis data")

        try:
            hits = self.search_tool.search(query, num_results=6)
            state["web_results"] = hits

            sections = []
            links = []
            for hit in hits:
                url = hit.get("url")
                title = hit.get("title", "")
                if not url:
                    continue
                text = self.browse_tool.fetch_clean(url)
                if not text:
                    continue
                sections.append(f"[{title}]:\n{text[:2000]}")
                links.append({"title": title, "url": url, "snippet": text[:200]})

            state["web_context"] = "\n\n".join(sections)
            state["links"] = links
            state["sources"] = [{"title": l["title"], "url": l["url"]} for l in links]

        except Exception as e:
            # Search/browse failure: downstream analysis runs without data.
            print(f" ❌ AnalysisSearchNode error: {e}")
            state["web_context"] = ""
            state["links"] = []
            state["sources"] = []

        return state
670
+
671
+
672
class AnalysisProcessNode:
    """Node 2: Generate structured analysis."""

    def __init__(self):
        self.llm = Config.get_llm()
        self.followup = FollowUpGenerator()

    def analyze(self, state: AnalysisState) -> AnalysisState:
        # Turns the web context gathered by AnalysisSearchNode into a
        # structured report; the markdown headings in the prompt shape the
        # model's output sections. Fills state["answer"] and ["followups"].
        query = state.get("query", "")
        context = state.get("web_context", "")

        prompt = f"""You are an expert analyst. Provide deep, comprehensive analysis.

RESEARCH DATA:
{context if context else "No external data available."}

ANALYSIS REQUEST: {query}

Provide structured analysis with:

## Executive Summary
(2-3 sentence overview)

## Key Findings
(Bullet points of main discoveries)

## Detailed Analysis
(In-depth examination with evidence)

## Data & Statistics
(Numbers, trends, comparisons if available)

## Conclusions
(Main takeaways)

## Recommendations
(Actionable suggestions)

Use citations [1], [2] when referencing sources.

ANALYSIS:"""

        resp = self.llm.invoke([
            {"role": "system", "content": PPLX_SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ])

        answer = resp.content
        state["answer"] = answer
        state["followups"] = self.followup.generate(answer, query)

        print(f" ✅ AnalysisProcessNode: Generated analysis")
        return state
725
+
726
+
727
+ # =============================================================================
728
+ # SUMMARIZE AGENTS
729
+ # =============================================================================
730
+
731
class SummarizeInputNode:
    """Node 1: Determine input type and fetch content."""

    def __init__(self):
        self.browse_tool = BrowseTool()
        self.search_tool = SearchTool()

    def process_input(self, state: SummarizeState) -> SummarizeState:
        """Resolve the query into raw text to summarize.

        A query starting with "http" is treated as a URL and fetched
        directly; anything else triggers a web search whose top page texts
        are concatenated. Always leaves content/links/sources populated
        (possibly empty) so the downstream node never sees missing keys.
        """
        query = state.get("query", "")

        if query.startswith("http"):
            state["is_url"] = True
            # ROBUSTNESS FIX: set defaults first so a fetch exception cannot
            # leave links/sources unset (the original only assigned them on
            # the success path).
            state["content"] = ""
            state["links"] = []
            state["sources"] = []
            try:
                content = self.browse_tool.fetch_clean(query)
                state["content"] = content or ""
                state["links"] = [{"title": "Source", "url": query, "snippet": content[:200] if content else ""}]
                state["sources"] = [{"title": "Source URL", "url": query}]
                print(f" 🔗 SummarizeInputNode: Fetched URL content")
            except Exception as e:
                print(f" ❌ Error fetching URL: {e}")
        else:
            state["is_url"] = False
            # Search and fetch the top results.
            try:
                results = self.search_tool.search(query, num_results=3)
                content_parts = []
                links = []
                for r in results:
                    url = r.get("url")
                    title = r.get("title", "")
                    if url:
                        text = self.browse_tool.fetch_clean(url)
                        if text:
                            content_parts.append(text[:1500])
                            links.append({"title": title, "url": url, "snippet": text[:150]})

                state["content"] = "\n\n".join(content_parts)
                state["links"] = links
                state["sources"] = [{"title": l["title"], "url": l["url"]} for l in links]
                print(f" 🔍 SummarizeInputNode: Fetched {len(links)} sources")
            except Exception as e:
                print(f" ❌ Error searching: {e}")
                state["content"] = query  # fall back to summarizing the query text
                state["links"] = []
                state["sources"] = []

        return state
780
+
781
+
782
class SummarizeProcessNode:
    """Node 2: Generate summary."""

    def __init__(self):
        self.summarizer = SummarizerTool()
        self.followup = FollowUpGenerator()

    def summarize(self, state: SummarizeState) -> SummarizeState:
        """Summarize fetched content, or report that none was found."""
        source_text = state.get("content", "")
        query = state.get("query", "")

        if not source_text:
            result = "Could not find content to summarize."
        else:
            result = self.summarizer.summarize(source_text, max_words=300)

        state["answer"] = result
        state["followups"] = self.followup.generate(result, query)

        print(f" ✅ SummarizeProcessNode: Generated summary")
        return state
803
+
rag/graph_deep.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production-Level LangGraph Pipelines for Perplexity Clone
3
+ ==========================================================
4
+ Each mode has its own graph with proper node structure.
5
+ """
6
+
7
+ from langgraph.graph import StateGraph, END
8
+ from rag.rag_state import (
9
+ RAGState,
10
+ WebSearchState,
11
+ RAGOnlyState,
12
+ AgenticState,
13
+ AnalysisState,
14
+ SummarizeState
15
+ )
16
+ from rag.agents import (
17
+ # Deep Research agents
18
+ PlannerAgent,
19
+ ResearchAgent,
20
+ AggregatorAgent,
21
+ WriterAgent,
22
+ ValidatorAgent,
23
+ # Web Search agents
24
+ WebSearchNode,
25
+ WebFetchNode,
26
+ WebContextNode,
27
+ WebAnswerNode,
28
+ # RAG agents
29
+ RAGRetrieveNode,
30
+ RAGContextNode,
31
+ RAGAnswerNode,
32
+ # Agentic agents
33
+ AgenticPlannerNode,
34
+ AgenticFileNode,
35
+ AgenticWebNode,
36
+ AgenticKnowledgeNode,
37
+ AgenticImageNode,
38
+ AgenticSynthesizerNode,
39
+ # Analysis agents
40
+ AnalysisSearchNode,
41
+ AnalysisProcessNode,
42
+ # Summarize agents
43
+ SummarizeInputNode,
44
+ SummarizeProcessNode,
45
+ )
46
+ from vectorstore.store import VectorStore
47
+
48
+
49
class DeepResearchGraph:
    """Deep Research mode graph.

    Pipeline: plan -> research -> aggregate -> write -> validate.
    Used for complex queries requiring multi-step analysis.
    """

    def __init__(self, vector_store: VectorStore) -> None:
        self.vs = vector_store
        self.planner = PlannerAgent()
        self.researcher = ResearchAgent(self.vs)
        self.aggregator = AggregatorAgent()
        self.writer = WriterAgent()
        self.validator = ValidatorAgent()
        self.graph = None  # compiled lazily on first run()

    def build(self):
        """Wire the five-stage pipeline, compile it, and cache it."""
        stages = [
            ("plan", self.planner.plan),
            ("research", self.researcher.research),
            ("aggregate", self.aggregator.aggregate),
            ("write", self.writer.write),
            ("validate", self.validator.validate_and_attach),
        ]
        g = StateGraph(RAGState)
        for name, handler in stages:
            g.add_node(name, handler)
        g.set_entry_point(stages[0][0])
        # Linear chain: each stage feeds the next, last stage terminates.
        for (src, _), (dst, _) in zip(stages, stages[1:]):
            g.add_edge(src, dst)
        g.add_edge(stages[-1][0], END)

        self.graph = g.compile()
        return self.graph

    def run(self, question: str) -> RAGState:
        """Invoke the pipeline for *question*, building the graph on first use."""
        if self.graph is None:
            self.build()
        print(f"\n🧠 DEEP RESEARCH GRAPH: {question[:50]}...")
        return self.graph.invoke({"question": question})
90
+
91
+
92
class WebSearchGraph:
    """Web Search mode graph.

    Pipeline: search -> fetch -> context -> answer.
    Used for real-time web queries with citations.
    """

    def __init__(self):
        self.search_node = WebSearchNode()
        self.fetch_node = WebFetchNode()
        self.context_node = WebContextNode()
        self.answer_node = WebAnswerNode()
        self.graph = None  # compiled lazily on first run()

    def build(self):
        """Wire the four-stage pipeline, compile it, and cache it."""
        pipeline = [
            ("search", self.search_node.search),
            ("fetch", self.fetch_node.fetch),
            ("context", self.context_node.build_context),
            ("answer", self.answer_node.answer),
        ]
        g = StateGraph(WebSearchState)
        for name, handler in pipeline:
            g.add_node(name, handler)
        g.set_entry_point(pipeline[0][0])
        for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
            g.add_edge(src, dst)
        g.add_edge(pipeline[-1][0], END)

        self.graph = g.compile()
        return self.graph

    def run(self, query: str) -> WebSearchState:
        """Invoke the pipeline for *query*, building the graph on first use."""
        if self.graph is None:
            self.build()
        print(f"\n🌐 WEB SEARCH GRAPH: {query[:50]}...")
        return self.graph.invoke({"query": query})
130
+
131
+
132
class RAGOnlyGraph:
    """RAG-Only mode graph.

    Pipeline: retrieve -> context -> answer.
    Used for searching uploaded documents only.
    """

    def __init__(self, file_manager):
        self.retrieve_node = RAGRetrieveNode(file_manager)
        self.context_node = RAGContextNode()
        self.answer_node = RAGAnswerNode()
        self.graph = None  # compiled lazily on first run()

    def build(self):
        """Wire the three-stage pipeline, compile it, and cache it."""
        pipeline = [
            ("retrieve", self.retrieve_node.retrieve),
            ("context", self.context_node.build_context),
            ("answer", self.answer_node.answer),
        ]
        g = StateGraph(RAGOnlyState)
        for name, handler in pipeline:
            g.add_node(name, handler)
        g.set_entry_point(pipeline[0][0])
        for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
            g.add_edge(src, dst)
        g.add_edge(pipeline[-1][0], END)

        self.graph = g.compile()
        return self.graph

    def run(self, query: str, workspace_id: str = "default") -> RAGOnlyState:
        """Invoke the pipeline for *query* against one workspace's documents."""
        if self.graph is None:
            self.build()
        print(f"\n📚 RAG ONLY GRAPH: {query[:50]}...")
        return self.graph.invoke({"query": query, "workspace_id": workspace_id})
167
+
168
+
169
class AgenticRAGGraph:
    """Agentic RAG mode graph.

    Pipeline: planner -> [file, web, knowledge, image] -> synthesizer.
    Multi-agent collaboration for comprehensive answers; the planner sets
    flags and each downstream agent checks its own flag internally.
    """

    def __init__(self, file_manager, vector_store: VectorStore, image_search):
        self.planner_node = AgenticPlannerNode()
        self.file_node = AgenticFileNode(file_manager)
        self.web_node = AgenticWebNode()
        self.knowledge_node = AgenticKnowledgeNode(vector_store)
        self.image_node = AgenticImageNode(image_search)
        self.synthesizer_node = AgenticSynthesizerNode()
        self.graph = None  # compiled lazily on first run()

    def build(self):
        """Wire planner, the four agents, and the synthesizer into a chain."""
        # Agents run sequentially after the planner; each one no-ops
        # unless the planner enabled its flag.
        chain = [
            ("planner", self.planner_node.plan),
            ("file_agent", self.file_node.retrieve),
            ("web_agent", self.web_node.search),
            ("knowledge_agent", self.knowledge_node.retrieve),
            ("image_agent", self.image_node.search),
            ("synthesizer", self.synthesizer_node.synthesize),
        ]
        g = StateGraph(AgenticState)
        for name, handler in chain:
            g.add_node(name, handler)
        g.set_entry_point(chain[0][0])
        for (src, _), (dst, _) in zip(chain, chain[1:]):
            g.add_edge(src, dst)
        g.add_edge(chain[-1][0], END)

        self.graph = g.compile()
        return self.graph

    def run(self, query: str, workspace_id: str = "default") -> AgenticState:
        """Invoke the multi-agent pipeline, building the graph on first use."""
        if self.graph is None:
            self.build()
        print(f"\n🤖 AGENTIC RAG GRAPH: {query[:50]}...")
        return self.graph.invoke({"query": query, "workspace_id": workspace_id})
218
+
219
+
220
class AnalysisGraph:
    """Analysis mode graph.

    Pipeline: search -> analyze.
    Deep analysis with structured output format.
    """

    def __init__(self):
        self.search_node = AnalysisSearchNode()
        self.process_node = AnalysisProcessNode()
        self.graph = None  # compiled lazily on first run()

    def build(self):
        """Wire the two-stage pipeline, compile it, and cache it."""
        workflow = StateGraph(AnalysisState)
        for label, handler in (
            ("search", self.search_node.search),
            ("analyze", self.process_node.analyze),
        ):
            workflow.add_node(label, handler)
        workflow.set_entry_point("search")
        workflow.add_edge("search", "analyze")
        workflow.add_edge("analyze", END)

        self.graph = workflow.compile()
        return self.graph

    def run(self, query: str) -> AnalysisState:
        """Invoke the pipeline for *query*, building the graph on first use."""
        if self.graph is None:
            self.build()
        print(f"\n📊 ANALYSIS GRAPH: {query[:50]}...")
        return self.graph.invoke({"query": query})
252
+
253
+
254
class SummarizeGraph:
    """Summarize mode graph.

    Pipeline: input -> summarize.
    Handles URL or search-based summarization.
    """

    def __init__(self):
        self.input_node = SummarizeInputNode()
        self.process_node = SummarizeProcessNode()
        self.graph = None  # compiled lazily on first run()

    def build(self):
        """Wire the two-stage pipeline, compile it, and cache it."""
        workflow = StateGraph(SummarizeState)
        for label, handler in (
            ("input", self.input_node.process_input),
            ("summarize", self.process_node.summarize),
        ):
            workflow.add_node(label, handler)
        workflow.set_entry_point("input")
        workflow.add_edge("input", "summarize")
        workflow.add_edge("summarize", END)

        self.graph = workflow.compile()
        return self.graph

    def run(self, query: str) -> SummarizeState:
        """Invoke the pipeline for *query*, building the graph on first use."""
        if self.graph is None:
            self.build()
        print(f"\n📝 SUMMARIZE GRAPH: {query[:50]}...")
        return self.graph.invoke({"query": query})
rag/rag_state.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any, Optional
2
+ from typing_extensions import TypedDict
3
+
4
+
5
class RAGState(TypedDict, total=False):
    """State object for deep research pipeline.

    All keys are optional (``total=False``); nodes of the deep-research
    graph fill them in as the state flows plan -> research -> aggregate ->
    write -> validate.
    """

    question: str                       # original user question
    sub_questions: List[str]            # planner's decomposition of the question
    local_docs: List[Dict[str, Any]]    # Serializable version of docs
    web_pages: List[Dict]               # {"title","url","content"}
    evidence: List[str]                 # text snippets
    draft_answers: List[str]            # per sub-question
    final_answer: str                   # synthesized answer for the whole question
    sources: List[Dict]                 # {"title","url"}
16
+
17
+
18
class WebSearchState(TypedDict, total=False):
    """State for Web Search mode graph.

    All keys optional (``total=False``); populated in order by the
    search -> fetch -> context -> answer nodes.
    """
    query: str                  # user query
    search_results: List[Dict]  # raw search-engine hits
    web_pages: List[Dict]       # fetched page contents
    context: str                # assembled context passed to the answerer
    answer: str                 # final answer text
    sources: List[Dict]         # cited sources for the answer
    links: List[Dict]           # link cards shown in the UI
    images: List[Dict]          # related images, if any
    followups: List[str]        # suggested follow-up questions
29
+
30
+
31
class RAGOnlyState(TypedDict, total=False):
    """State for RAG-only mode graph.

    All keys optional (``total=False``); populated by the
    retrieve -> context -> answer nodes.
    """
    query: str               # user query
    workspace_id: str        # which uploaded-docs workspace to search
    file_chunks: List[Dict]  # chunks retrieved from uploaded files
    base_chunks: List[Dict]  # presumably chunks from a base index — confirm in RAGRetrieveNode
    context: str             # assembled context passed to the answerer
    answer: str              # final answer text
    sources: List[Dict]      # cited document sources
    followups: List[str]     # suggested follow-up questions
41
+
42
+
43
class AgenticState(TypedDict, total=False):
    """State for Agentic RAG mode graph.

    All keys optional (``total=False``). The planner sets the ``use_*``
    flags; each agent node writes its own context/sources; the
    synthesizer combines everything into the final answer.
    """
    query: str
    workspace_id: str

    # Planner outputs — which agents should run for this query
    use_file: bool
    use_web: bool
    use_images: bool
    use_knowledge: bool

    # Agent outputs
    file_context: str           # text retrieved from uploaded files
    file_sources: List[Dict]
    web_context: str            # text retrieved from the web
    web_sources: List[Dict]
    links: List[Dict]
    knowledge_context: str      # text retrieved from the knowledge vector store
    images: List[Dict]

    # Synthesizer output
    combined_context: str       # merged context from all active agents
    answer: str
    sources: List[Dict]
    followups: List[str]
68
+
69
+
70
class AnalysisState(TypedDict, total=False):
    """State for Analysis mode graph.

    All keys optional (``total=False``); populated by the
    search -> analyze nodes.
    """
    query: str                  # user query
    web_results: List[Dict]     # raw web search hits
    web_context: str            # assembled web context for the analyzer
    analysis: str               # full structured analysis text
    executive_summary: str      # short top-level summary of the analysis
    key_findings: List[str]     # bullet-point findings
    answer: str                 # final answer shown to the user
    sources: List[Dict]
    links: List[Dict]
    images: List[Dict]
    followups: List[str]
83
+
84
+
85
class SummarizeState(TypedDict, total=False):
    """State for Summarize mode graph.

    All keys optional (``total=False``); populated by the
    input -> summarize nodes.
    """
    query: str           # user query — either a URL or a search phrase
    is_url: bool         # True when the query was detected to be a URL
    content: str         # fetched/aggregated source text to summarize
    summary: str         # generated summary
    answer: str          # final answer shown to the user
    sources: List[Dict]
    links: List[Dict]
    followups: List[str]
95
+
rag/router.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from config.config import Config
3
+
4
+ class RouterAgent:
5
+ """
6
+ Production-grade router exactly like Perplexity:
7
+ 1. Rule-based fast routing
8
+ 2. NER-based entity detection
9
+ 3. Real-time classifier
10
+ 4. LLM semantic classifier (handles ANY query)
11
+ """
12
+
13
+ def __init__(self):
14
+ self.llm = Config.get_llm()
15
+
16
+ # ---------------- FAST RULES ----------------
17
+ def contains(self, q, words):
18
+ q = q.lower()
19
+ return any(w in q for w in words)
20
+
21
+ def is_greeting(self, q):
22
+ q_low = q.lower().strip()
23
+ return q_low in ["hi", "hello", "hey", "yo", "sup", "hi there", "hello there"]
24
+
25
+ def is_image_query(self, q):
26
+ image_words = ["image", "photo", "pic", "picture", "logo", "wallpaper", "screenshot"]
27
+ return self.contains(q, image_words)
28
+
29
+ def is_realtime(self, q):
30
+ realtime = [
31
+ "today", "now", "latest", "current",
32
+ "price", "stock", "weather", "news",
33
+ "update", "live", "score", "match", "schedule"
34
+ ]
35
+ return self.contains(q, realtime)
36
+
37
+ def is_world_fact(self, q):
38
+ patterns = [
39
+ "prime minister", "president", "capital of",
40
+ "ceo", "founder", "population", "richest",
41
+ "oldest", "largest", "smallest", "currency",
42
+ "country", "state", "city", "minister",
43
+ "government", "party"
44
+ ]
45
+ return self.contains(q, patterns)
46
+
47
+ def is_ai_model(self, q):
48
+ ai_models = ["gpt", "gemini", "llama", "claude", "grok", "mistral", "phi"]
49
+ return self.contains(q, ai_models)
50
+
51
+ def is_definition(self, q):
52
+ q = q.lower()
53
+ return q.startswith(("what is", "define", "explain"))
54
+
55
+ def is_deep(self, q):
56
+ q = q.lower()
57
+ return any(x in q for x in [
58
+ "compare", "analysis", "impact", "advantages", "disadvantages",
59
+ "evaluate", "future", "strategy", "risk"
60
+ ])
61
+
62
+ def is_entity(self, q):
63
+ """Detects entities by uppercase words"""
64
+ words = q.split()
65
+ caps = [w for w in words if w[:1].isupper()]
66
+ return len(caps) >= 1
67
+
68
+ # ---------------- LLM CLASSIFIER ----------------
69
+ def llm_decide(self, q):
70
+ """
71
+ FINAL DECISION MAKER.
72
+ If rules fail or query is unusual → LLM decides mode.
73
+ """
74
+ system = {
75
+ "role": "system",
76
+ "content": """
77
+ Classify this query into exactly one mode:
78
+
79
+ - "web" → real-time facts, entities, news, people, companies, trending topics
80
+ - "rag" → definitions, conceptual explanations, structured factual info
81
+ - "llm" → normal chat, creative tasks, responses without external info
82
+ - "deep_research" → multi-step analysis, long reports, deep comparisons
83
+
84
+ Return ONLY one word: web, rag, llm, or deep_research.
85
+ """
86
+ }
87
+
88
+ user = {"role": "user", "content": q}
89
+
90
+ resp = self.llm.invoke([system, user]).content.strip().lower()
91
+ if resp in ["web", "rag", "llm", "deep_research"]:
92
+ return resp
93
+ return "llm"
94
+
95
+ # ---------------- FINAL ROUTER ----------------
96
+ def route(self, q: str) -> str:
97
+ q = q.strip()
98
+
99
+ # LAYER 1 — FAST RULES
100
+ if self.is_greeting(q): return "llm"
101
+ if self.is_image_query(q): return "image"
102
+ if self.is_realtime(q): return "web"
103
+ if self.is_world_fact(q): return "web"
104
+ if self.is_ai_model(q): return "web"
105
+
106
+ # Short entity queries (1-2 words) → web
107
+ if len(q.split()) <= 2 and self.is_entity(q): return "web"
108
+
109
+ if self.is_deep(q): return "deep_research"
110
+ if self.is_definition(q): return "rag"
111
+
112
+ # LAYER 2 — LLM SEMANTIC CLASSIFICATION
113
+ return self.llm_decide(q)
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core LLM + tools
2
+ langchain
3
+ langchain-core
4
+ langchain-community
5
+ langgraph
6
+
7
+ # LLM providers (Groq via LangChain)
8
+ langchain-groq
9
+
10
+ # Web API backend
11
+ fastapi
12
+ uvicorn[standard]
13
+ pydantic
14
+ python-dotenv
15
+
16
+ # Embeddings + vector search
17
+ sentence-transformers
18
+ faiss-cpu
19
+
20
+ # Web search + HTTP
21
+ requests
22
+ tavily-python
23
+
24
+ # Scraping
25
+ trafilatura
26
+ beautifulsoup4
27
+
28
+ # Wikipedia tool dependency
29
+ wikipedia
30
+
31
+ # PDF/text load support
32
+ pypdf
streamlit_app.py ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from urllib.parse import urlparse
4
+
5
+ # =====================================
6
+ # PAGE CONFIG
7
+ # =====================================
8
+ st.set_page_config(
9
+ page_title="Perplexity AI Clone",
10
+ page_icon="🔍",
11
+ layout="wide",
12
+ initial_sidebar_state="collapsed"
13
+ )
14
+
15
+ # =====================================
16
+ # SESSION STATE
17
+ # =====================================
18
+ if "messages" not in st.session_state:
19
+ st.session_state.messages = []
20
+ if "mode" not in st.session_state:
21
+ st.session_state.mode = "Automatic"
22
+ if "current_result" not in st.session_state:
23
+ st.session_state.current_result = None
24
+ if "theme" not in st.session_state:
25
+ st.session_state.theme = "dark"
26
+ if "uploaded_files" not in st.session_state:
27
+ st.session_state.uploaded_files = []
28
+ if "show_upload" not in st.session_state:
29
+ st.session_state.show_upload = False
30
+
31
+ # =====================================
32
+ # CONFIGURATION
33
+ # =====================================
34
+ API_URL = "http://localhost:8000"
35
+ WORKSPACE = "default"
36
+
37
+ # MODE MAPPING - All 8 modes with correct backend endpoints
38
+ MODES = {
39
+ "Automatic": {
40
+ "icon": "🔍",
41
+ "desc": "Auto-routes to best mode",
42
+ "endpoint": "/api/chat"
43
+ },
44
+ "Web Search": {
45
+ "icon": "🌐",
46
+ "desc": "Real-time web search",
47
+ "endpoint": "/api/web"
48
+ },
49
+ "RAG": {
50
+ "icon": "📚",
51
+ "desc": "Search uploaded documents",
52
+ "endpoint": "/api/rag"
53
+ },
54
+ "Agentic": {
55
+ "icon": "🤖",
56
+ "desc": "Multi-agent collaboration",
57
+ "endpoint": "/api/agentic"
58
+ },
59
+ "Deep Research": {
60
+ "icon": "🧠",
61
+ "desc": "In-depth research",
62
+ "endpoint": "/api/deep_research"
63
+ },
64
+ "Analysis": {
65
+ "icon": "📊",
66
+ "desc": "Deep data analysis",
67
+ "endpoint": "/api/analyze"
68
+ },
69
+ "Summarize": {
70
+ "icon": "📝",
71
+ "desc": "Summarize content",
72
+ "endpoint": "/api/summarize"
73
+ },
74
+ "Chat": {
75
+ "icon": "💬",
76
+ "desc": "Direct AI chat",
77
+ "endpoint": "/api/focus"
78
+ },
79
+ }
80
+
81
+ # =====================================
82
+ # CSS - PERPLEXITY EXACT STYLE
83
+ # =====================================
84
+ def get_css():
85
+ is_dark = st.session_state.theme == "dark"
86
+
87
+ if is_dark:
88
+ colors = {
89
+ "bg": "#191A1A",
90
+ "bg2": "#1F2020",
91
+ "bg3": "#2A2B2B",
92
+ "text": "#ECECEC",
93
+ "text2": "#A1A1A1",
94
+ "muted": "#6B6B6B",
95
+ "accent": "#20B8CD",
96
+ "border": "#3A3B3B",
97
+ "success": "#22C55E"
98
+ }
99
+ else:
100
+ colors = {
101
+ "bg": "#FFFFFF",
102
+ "bg2": "#F7F7F8",
103
+ "bg3": "#EEEEEF",
104
+ "text": "#1A1A1A",
105
+ "text2": "#666666",
106
+ "muted": "#999999",
107
+ "accent": "#0EA5E9",
108
+ "border": "#E5E5E5",
109
+ "success": "#22C55E"
110
+ }
111
+
112
+ return f"""
113
+ <style>
114
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&display=swap');
115
+
116
+ * {{ font-family: 'Inter', sans-serif !important; }}
117
+
118
+ #MainMenu, footer, header, [data-testid="stToolbar"], .stDeployButton {{ display: none !important; }}
119
+
120
+ .stApp {{ background: {colors['bg']} !important; }}
121
+
122
+ [data-testid="stSidebar"] {{
123
+ background: {colors['bg']} !important;
124
+ border-right: 1px solid {colors['border']} !important;
125
+ }}
126
+
127
+ /* Hero */
128
+ .hero {{
129
+ text-align: center;
130
+ padding: 30px 0 15px;
131
+ }}
132
+ .hero-compact {{
133
+ text-align: center;
134
+ padding: 15px 0 10px;
135
+ }}
136
+ .hero-compact .logo {{
137
+ font-size: 28px;
138
+ }}
139
+ .hero-compact .tagline {{
140
+ display: none;
141
+ }}
142
+ .logo {{
143
+ font-size: 40px;
144
+ font-weight: 600;
145
+ color: {colors['text']};
146
+ letter-spacing: -1px;
147
+ }}
148
+ .logo span {{
149
+ background: linear-gradient(135deg, {colors['accent']}, #14B8A6);
150
+ -webkit-background-clip: text;
151
+ -webkit-text-fill-color: transparent;
152
+ }}
153
+ .tagline {{
154
+ color: {colors['muted']};
155
+ font-size: 14px;
156
+ margin-top: 5px;
157
+ }}
158
+
159
+ /* UNIFIED SEARCH BOX - All elements inside */
160
+ .search-wrapper {{
161
+ max-width: 800px;
162
+ margin: 0 auto;
163
+ padding: 0 20px;
164
+ }}
165
+
166
+ /* Hide streamlit defaults */
167
+ .stTextInput > div > div {{
168
+ background: {colors['bg2']} !important;
169
+ border: 1px solid {colors['border']} !important;
170
+ border-radius: 25px !important;
171
+ }}
172
+ .stTextInput input {{
173
+ background: transparent !important;
174
+ border: none !important;
175
+ color: {colors['text']} !important;
176
+ font-size: 15px !important;
177
+ padding: 12px 16px !important;
178
+ }}
179
+ .stTextInput input::placeholder {{
180
+ color: {colors['muted']} !important;
181
+ }}
182
+ .stTextInput label {{ display: none !important; }}
183
+
184
+ .stSelectbox > div > div {{
185
+ background: {colors['bg3']} !important;
186
+ border: 1px solid {colors['border']} !important;
187
+ border-radius: 18px !important;
188
+ }}
189
+ .stSelectbox [data-baseweb="select"] > div {{
190
+ background: {colors['bg3']} !important;
191
+ border: none !important;
192
+ }}
193
+ .stSelectbox [data-baseweb="select"] > div > div {{
194
+ color: {colors['text']} !important;
195
+ }}
196
+ /* Dropdown menu styling */
197
+ [data-baseweb="popover"] {{
198
+ background: {colors['bg2']} !important;
199
+ border: 1px solid {colors['border']} !important;
200
+ border-radius: 12px !important;
201
+ }}
202
+ [data-baseweb="menu"] {{
203
+ background: {colors['bg2']} !important;
204
+ }}
205
+ [data-baseweb="menu"] li {{
206
+ background: {colors['bg2']} !important;
207
+ color: {colors['text']} !important;
208
+ }}
209
+ [data-baseweb="menu"] li:hover {{
210
+ background: {colors['bg3']} !important;
211
+ }}
212
+ .stSelectbox label {{ display: none !important; }}
213
+
214
+ /* Buttons - theme aware */
215
+ .stButton > button {{
216
+ background: {colors['bg2']} !important;
217
+ border: 1px solid {colors['border']} !important;
218
+ border-radius: 12px !important;
219
+ color: {colors['text']} !important;
220
+ font-size: 16px !important;
221
+ padding: 8px 16px !important;
222
+ transition: all 0.2s !important;
223
+ }}
224
+ .stButton > button:hover {{
225
+ background: {colors['accent']} !important;
226
+ color: white !important;
227
+ border-color: {colors['accent']} !important;
228
+ }}
229
+ .stButton > button:active {{
230
+ background: {colors['accent']} !important;
231
+ }}
232
+
233
+ /* Form submit button */
234
+ .stFormSubmitButton > button {{
235
+ background: {colors['bg3']} !important;
236
+ border: 1px solid {colors['border']} !important;
237
+ border-radius: 20px !important;
238
+ color: {colors['text']} !important;
239
+ }}
240
+ .stFormSubmitButton > button:hover {{
241
+ background: {colors['accent']} !important;
242
+ color: white !important;
243
+ border-color: {colors['accent']} !important;
244
+ }}
245
+
246
+ /* File uploader styling - COMPLETE FIX */
247
+ .stFileUploader {{
248
+ max-width: 600px;
249
+ margin: 10px auto;
250
+ }}
251
+ .stFileUploader > div {{
252
+ background: transparent !important;
253
+ }}
254
+ .stFileUploader > div > div {{
255
+ background: transparent !important;
256
+ }}
257
+ .stFileUploader [data-testid="stFileUploaderDropzone"] {{
258
+ background: {colors['bg2']} !important;
259
+ border: 2px dashed {colors['border']} !important;
260
+ border-radius: 12px !important;
261
+ padding: 20px !important;
262
+ }}
263
+ .stFileUploader [data-testid="stFileUploaderDropzone"]:hover {{
264
+ border-color: {colors['accent']} !important;
265
+ }}
266
+ /* All text inside dropzone */
267
+ .stFileUploader [data-testid="stFileUploaderDropzone"] * {{
268
+ color: {colors['text']} !important;
269
+ }}
270
+ .stFileUploader [data-testid="stFileUploaderDropzone"] span {{
271
+ color: {colors['text']} !important;
272
+ }}
273
+ .stFileUploader [data-testid="stFileUploaderDropzone"] p {{
274
+ color: {colors['text']} !important;
275
+ }}
276
+ .stFileUploader [data-testid="stFileUploaderDropzone"] small {{
277
+ color: {colors['text2']} !important;
278
+ }}
279
+ .stFileUploader [data-testid="stFileUploaderDropzone"] svg {{
280
+ fill: {colors['text2']} !important;
281
+ stroke: {colors['text2']} !important;
282
+ }}
283
+ .stFileUploader [data-testid="stFileUploaderDropzone"] button {{
284
+ background: {colors['accent']} !important;
285
+ color: white !important;
286
+ border: none !important;
287
+ border-radius: 8px !important;
288
+ }}
289
+ .stFileUploader label {{
290
+ color: {colors['text']} !important;
291
+ font-size: 14px !important;
292
+ }}
293
+ .stFileUploader > section {{
294
+ background: transparent !important;
295
+ border: none !important;
296
+ }}
297
+ .stFileUploader > section > div {{
298
+ background: transparent !important;
299
+ }}
300
+
301
+ /* Answer box */
302
+ .answer-box {{
303
+ background: {colors['bg2']};
304
+ border: 1px solid {colors['border']};
305
+ border-radius: 16px;
306
+ padding: 24px;
307
+ color: {colors['text']};
308
+ font-size: 15px;
309
+ line-height: 1.8;
310
+ }}
311
+
312
+ /* Source cards */
313
+ .source-card {{
314
+ background: {colors['bg3']};
315
+ border: 1px solid {colors['border']};
316
+ border-radius: 10px;
317
+ padding: 12px;
318
+ margin-bottom: 8px;
319
+ transition: all 0.2s;
320
+ }}
321
+ .source-card:hover {{
322
+ border-color: {colors['accent']};
323
+ }}
324
+ .source-title {{
325
+ color: {colors['accent']};
326
+ font-size: 13px;
327
+ font-weight: 500;
328
+ text-decoration: none;
329
+ }}
330
+ .source-domain {{
331
+ color: {colors['muted']};
332
+ font-size: 11px;
333
+ }}
334
+
335
+ /* Query display */
336
+ .query-box {{
337
+ background: {colors['bg2']};
338
+ border: 1px solid {colors['border']};
339
+ border-radius: 12px;
340
+ padding: 16px;
341
+ margin: 15px 0;
342
+ }}
343
+ .query-text {{
344
+ color: {colors['text']};
345
+ font-size: 17px;
346
+ font-weight: 500;
347
+ }}
348
+ .query-mode {{
349
+ color: {colors['accent']};
350
+ font-size: 12px;
351
+ margin-top: 6px;
352
+ }}
353
+
354
+ /* Tabs */
355
+ .stTabs [data-baseweb="tab-list"] {{
356
+ background: transparent !important;
357
+ border-bottom: 1px solid {colors['border']} !important;
358
+ gap: 0 !important;
359
+ }}
360
+ .stTabs [data-baseweb="tab"] {{
361
+ background: transparent !important;
362
+ color: {colors['text2']} !important;
363
+ }}
364
+ .stTabs [data-baseweb="tab"][aria-selected="true"] {{
365
+ color: {colors['accent']} !important;
366
+ border-bottom-color: {colors['accent']} !important;
367
+ }}
368
+ .stTabs [data-baseweb="tab-panel"] {{
369
+ padding-top: 1rem !important;
370
+ }}
371
+
372
+ /* Answer text styling */
373
+ .stTabs [data-testid="stMarkdownContainer"] {{
374
+ color: {colors['text']} !important;
375
+ font-size: 15px !important;
376
+ line-height: 1.7 !important;
377
+ }}
378
+
379
+ /* Mode desc text */
380
+ .mode-desc {{
381
+ text-align: center;
382
+ color: {colors['muted']};
383
+ font-size: 12px;
384
+ margin-top: 8px;
385
+ }}
386
+
387
+ /* Column spacing fix */
388
+ [data-testid="column"] {{ padding: 0 2px !important; }}
389
+
390
+ /* Expander styling */
391
+ .streamlit-expanderHeader {{
392
+ background: {colors['bg3']} !important;
393
+ border: 1px solid {colors['border']} !important;
394
+ border-radius: 8px !important;
395
+ color: {colors['text']} !important;
396
+ }}
397
+ .streamlit-expanderContent {{
398
+ background: {colors['bg2']} !important;
399
+ border: 1px solid {colors['border']} !important;
400
+ border-top: none !important;
401
+ border-radius: 0 0 8px 8px !important;
402
+ color: {colors['text']} !important;
403
+ }}
404
+ [data-testid="stExpander"] {{
405
+ background: {colors['bg2']} !important;
406
+ border: 1px solid {colors['border']} !important;
407
+ border-radius: 8px !important;
408
+ }}
409
+ [data-testid="stExpander"] summary {{
410
+ color: {colors['text']} !important;
411
+ }}
412
+ [data-testid="stExpander"] [data-testid="stMarkdownContainer"] {{
413
+ color: {colors['text']} !important;
414
+ }}
415
+
416
+ /* Spinner and alerts */
417
+ .stSpinner > div {{
418
+ border-color: {colors['accent']} !important;
419
+ }}
420
+ .stAlert {{
421
+ background: {colors['bg2']} !important;
422
+ color: {colors['text']} !important;
423
+ border: 1px solid {colors['border']} !important;
424
+ }}
425
+
426
+ /* Caption text */
427
+ .stCaption, [data-testid="stCaptionContainer"] {{
428
+ color: {colors['text2']} !important;
429
+ }}
430
+
431
+ /* Divider */
432
+ hr {{
433
+ border-color: {colors['border']} !important;
434
+ }}
435
+ </style>
436
+ """
437
+
438
+ st.markdown(get_css(), unsafe_allow_html=True)
439
+
440
+
441
+ # =====================================
442
+ # HELPER FUNCTIONS
443
+ # =====================================
444
def call_api(query: str, mode: str):
    """POST *query* to the backend endpoint configured for *mode*.

    Unknown modes fall back to "Automatic". Returns the backend's decoded
    JSON on success, or an error-shaped dict (same keys the UI renders)
    when the request fails, times out, or the server answers with an
    HTTP error status.
    """
    mode_config = MODES.get(mode, MODES["Automatic"])
    endpoint = mode_config["endpoint"]

    payload = {
        "message": query,
        "workspace_id": WORKSPACE,
        "mode": mode.lower().replace(" ", "_")
    }

    try:
        response = requests.post(f"{API_URL}{endpoint}", json=payload, timeout=180)
        # Surface 4xx/5xx as a readable error instead of letting an HTML
        # error page blow up json() with a confusing decode exception.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        return {
            "answer": f"Error: {str(e)}",
            "sources": [],
            "links": [],
            "images": [],
            "followups": []
        }
466
+
467
+
468
def upload_files(files):
    """Upload Streamlit file objects to the backend workspace.

    Returns True on HTTP success, False when *files* is empty or the
    request fails at the network level.
    """
    if not files:
        return False

    files_payload = [
        ("files", (f.name, f.getvalue(), f.type or "application/octet-stream"))
        for f in files
    ]

    try:
        r = requests.post(
            f"{API_URL}/api/upload_docs",
            data={"workspace_id": WORKSPACE},
            files=files_payload,
            timeout=60
        )
        return r.ok
    except requests.RequestException:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt/
        # SystemExit; only request failures should degrade to False.
        return False
488
+
489
+
490
def get_domain(url: str) -> str:
    """Return the host of *url* with any ``www.`` stripped.

    Falls back to the first 30 characters of the raw input when parsing
    fails (e.g. ``urlparse`` raises ValueError for an invalid IPv6 netloc).
    """
    try:
        return urlparse(url).netloc.replace('www.', '')
    except (ValueError, AttributeError):
        # Was a bare ``except:`` — narrowed to the errors urlparse/str
        # handling can actually raise so real bugs aren't hidden.
        return url[:30]
495
+
496
+
497
+ # =====================================
498
+ # THEME TOGGLE
499
+ # =====================================
500
+ col_spacer, col_theme = st.columns([12, 1])
501
+ with col_theme:
502
+ theme_icon = "🌙" if st.session_state.theme == "dark" else "☀️"
503
+ if st.button(theme_icon, key="theme_toggle"):
504
+ st.session_state.theme = "light" if st.session_state.theme == "dark" else "dark"
505
+ st.rerun()
506
+
507
+
508
+ # =====================================
509
+ # HERO - Always show
510
+ # =====================================
511
+ if st.session_state.current_result:
512
+ # Compact version when showing results
513
+ st.markdown("""
514
+ <div class="hero-compact">
515
+ <div class="logo">perplexity<span>clone</span></div>
516
+ </div>
517
+ """, unsafe_allow_html=True)
518
+ else:
519
+ # Full version on home
520
+ st.markdown("""
521
+ <div class="hero">
522
+ <div class="logo">perplexity<span>clone</span></div>
523
+ <div class="tagline">Where knowledge begins</div>
524
+ </div>
525
+ """, unsafe_allow_html=True)
526
+
527
+
528
+ # =====================================
529
+ # UNIFIED SEARCH BOX (All elements inside)
530
+ # =====================================
531
+ st.markdown('<div class="search-wrapper">', unsafe_allow_html=True)
532
+
533
+ # Single row with everything inside
534
+ col1, col2, col3, col4 = st.columns([2, 8, 1, 1])
535
+
536
+ with col1:
537
+ # Mode selector dropdown
538
+ mode_list = list(MODES.keys())
539
+ current_idx = mode_list.index(st.session_state.mode)
540
+ selected = st.selectbox(
541
+ "mode",
542
+ mode_list,
543
+ index=current_idx,
544
+ format_func=lambda x: f"{MODES[x]['icon']} {x}",
545
+ label_visibility="collapsed",
546
+ key="mode_select"
547
+ )
548
+ if selected != st.session_state.mode:
549
+ st.session_state.mode = selected
550
+ st.rerun()
551
+
552
+ with col2:
553
+ # Search input
554
+ query = st.text_input(
555
+ "search",
556
+ placeholder="Ask anything...",
557
+ label_visibility="collapsed",
558
+ key="query_input"
559
+ )
560
+
561
+ with col3:
562
+ # File upload icon button - toggles file picker
563
+ if st.button("📎", key="attach_btn", help="Upload files"):
564
+ st.session_state.show_upload = not st.session_state.show_upload
565
+
566
+ with col4:
567
+ # Submit button
568
+ submit = st.button("→", key="submit_btn", help="Search")
569
+
570
+ st.markdown('</div>', unsafe_allow_html=True)
571
+
572
+ # Mode description
573
+ st.markdown(f'<div class="mode-desc">{MODES[st.session_state.mode]["icon"]} {st.session_state.mode}: {MODES[st.session_state.mode]["desc"]}</div>', unsafe_allow_html=True)
574
+
575
+ # Show file uploader when icon is clicked
576
+ if st.session_state.show_upload:
577
+ uploaded = st.file_uploader(
578
+ "Upload documents (PDF, TXT, MD, PPTX)",
579
+ type=["pdf", "txt", "md", "pptx"],
580
+ accept_multiple_files=True,
581
+ key="file_uploader"
582
+ )
583
+
584
+ if uploaded:
585
+ with st.spinner("📤 Uploading..."):
586
+ if upload_files(uploaded):
587
+ new_files = [f.name for f in uploaded if f.name not in st.session_state.uploaded_files]
588
+ if new_files:
589
+ st.session_state.uploaded_files.extend(new_files)
590
+ st.success(f"✅ {len(new_files)} file(s) uploaded!")
591
+ st.session_state.show_upload = False
592
+ st.rerun()
593
+
594
+ # Show uploaded files count
595
+ if st.session_state.uploaded_files:
596
+ st.caption(f"📁 {len(st.session_state.uploaded_files)} file(s) ready for RAG")
597
+
598
+
599
+ # =====================================
600
+ # HANDLE SEARCH
601
+ # =====================================
602
+ if submit and query.strip():
603
+ with st.spinner(f"🔄 {st.session_state.mode}..."):
604
+ result = call_api(query.strip(), st.session_state.mode)
605
+ st.session_state.current_result = {
606
+ "query": query.strip(),
607
+ "mode": st.session_state.mode,
608
+ "data": result
609
+ }
610
+ st.rerun()
611
+
612
+
613
+ # =====================================
614
+ # DISPLAY RESULTS
615
+ # =====================================
616
+ if st.session_state.current_result:
617
+ result = st.session_state.current_result
618
+ data = result["data"]
619
+
620
+ st.divider()
621
+
622
+ # Query box
623
+ mode_info = MODES.get(result['mode'], MODES['Automatic'])
624
+ st.markdown(f"""
625
+ <div class="query-box">
626
+ <div class="query-text">{result['query']}</div>
627
+ <div class="query-mode">{mode_info['icon']} {result['mode']}</div>
628
+ </div>
629
+ """, unsafe_allow_html=True)
630
+
631
+ # Sources count
632
+ sources = data.get("sources", []) or data.get("links", [])
633
+ if sources:
634
+ st.success(f"✓ {len(sources)} sources")
635
+
636
+ # Layout - Full width (removed duplicate sidebar sources)
637
+ tabs = st.tabs(["✨ Answer", "🔗 Sources", "🖼️ Images"])
638
+
639
+ with tabs[0]:
640
+ answer = data.get("answer", "No answer.")
641
+
642
+ # Display answer directly with markdown
643
+ st.markdown(answer)
644
+
645
+ followups = data.get("followups", [])
646
+ if followups:
647
+ st.markdown("**Related:**")
648
+ for i, fu in enumerate(followups[:3]):
649
+ if st.button(f"→ {fu}", key=f"fu_{i}"):
650
+ with st.spinner("..."):
651
+ new_result = call_api(fu, st.session_state.mode)
652
+ st.session_state.current_result = {
653
+ "query": fu,
654
+ "mode": st.session_state.mode,
655
+ "data": new_result
656
+ }
657
+ st.rerun()
658
+
659
+ with tabs[1]:
660
+ links = data.get("links", [])
661
+ if links:
662
+ for link in links:
663
+ st.markdown(f"""
664
+ <div class="source-card">
665
+ <a href="{link.get('url','#')}" target="_blank" class="source-title">{link.get('title','Source')}</a>
666
+ <div class="source-domain">{get_domain(link.get('url',''))}</div>
667
+ </div>
668
+ """, unsafe_allow_html=True)
669
+ else:
670
+ st.info("No sources")
671
+
672
+ with tabs[2]:
673
+ images = data.get("images", [])
674
+ if images:
675
+ cols = st.columns(3)
676
+ for i, img in enumerate(images[:9]):
677
+ url = img.get("url") or img.get("thumbnail_url")
678
+ if url:
679
+ with cols[i % 3]:
680
+ st.image(url, use_container_width=True)
681
+ else:
682
+ st.info("No images")
683
+
684
+
685
+ # =====================================
686
+ # SIDEBAR (for settings)
687
+ # =====================================
688
+ with st.sidebar:
689
+ st.markdown("### ⚙️ Settings")
690
+ st.divider()
691
+
692
+ if st.button("🗑️ Clear Chat", use_container_width=True):
693
+ st.session_state.current_result = None
694
+ st.session_state.messages = []
695
+ st.rerun()
696
+
697
+ if st.button("🗑️ Clear Files", use_container_width=True):
698
+ st.session_state.uploaded_files = []
699
+ st.info("Files cleared")
700
+
701
+ st.divider()
702
+ st.caption(f"Theme: {'🌙 Dark' if st.session_state.theme == 'dark' else '☀️ Light'}")
703
+ st.caption(f"Mode: {st.session_state.mode}")
704
+
705
+ if st.session_state.uploaded_files:
706
+ st.divider()
707
+ st.markdown("### 📁 Files")
708
+ for f in st.session_state.uploaded_files:
709
+ st.caption(f"📄 {f}")
tools/__init__.py ADDED
File without changes
tools/browse_tool.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import trafilatura
3
+
4
+
5
class BrowseTool:
    """Fetches web pages over HTTP and strips them down to readable text."""

    def fetch_clean(self, url: str) -> str:
        """Return the main article text of *url*, or "" on any failure."""
        try:
            page = requests.get(url, timeout=20)
            page.raise_for_status()
            # trafilatura pulls out the main content, dropping boilerplate;
            # comments and tables are excluded to keep the text LLM-friendly.
            extracted = trafilatura.extract(
                page.text,
                include_comments=False,
                include_tables=False,
            )
            return extracted or ""
        except Exception:
            # Best-effort fetch: any network/parse problem yields empty text.
            return ""
tools/citation_tool.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict
3
+
4
+
5
class CitationTool:
    """Extracts [1], [2]… indices from answer and maps to sources."""

    _pattern = re.compile(r"\[(\d+)\]")

    def extract_indices(self, answer: str) -> List[int]:
        """Return the distinct citation numbers found in *answer*, ascending."""
        seen = set()
        for match in self._pattern.finditer(answer):
            seen.add(int(match.group(1)))
        return sorted(seen)

    def attach_sources(self, answer: str, sources: List[Dict]) -> List[Dict]:
        """Return the entries of *sources* actually cited in *answer*.

        Citation [n] is 1-based and maps to sources[n - 1]; out-of-range
        citation numbers are silently ignored.
        """
        total = len(sources)
        return [
            sources[idx - 1]
            for idx in self.extract_indices(answer)
            if 1 <= idx <= total
        ]
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config.config import Config
2
+
3
class FollowUpGenerator:
    """
    Generate 3–5 follow-up suggestions like Perplexity.
    """

    def __init__(self):
        # Shared LLM instance, configured centrally in Config.
        self.llm = Config.get_llm()

    def generate(self, answer: str, question: str):
        """Return up to 4 short follow-up questions for this exchange.

        Args:
            answer: The assistant's most recent answer.
            question: The user's original question.

        Returns:
            List of at most 4 suggestion strings; may be empty if the LLM
            response contains no "•" bullet lines.
        """
        prompt = f"""
Given the user question and the assistant answer, generate 3 short follow-up questions the user might ask next.

Rules:
- Keep them brief (max 8–12 words)
- No numbered list
- No explanations
- Only return bullet points starting with "•"
- Must be relevant and helpful

User question: {question}
Assistant answer: {answer}

Generate follow-ups:
"""

        resp = self.llm.invoke(prompt).content
        lines = resp.strip().split("\n")

        # Only keep bullet lines; the "•" marker itself is stripped out.
        suggestions = [l.replace("•", "").strip() for l in lines if "•" in l]
        return suggestions[:4]
tools/image_tavily.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tools/image_tavily.py
2
+
3
+ import os
4
+ from tavily import TavilyClient
5
+ from typing import List, Dict
6
+
7
+
8
class TavilyImageSearch:
    """
    Tavily image search API wrapper.
    """

    def __init__(self):
        # Fail fast at construction time when the key is not configured.
        api_key = os.getenv("TAVILY_API_KEY")
        if not api_key:
            raise RuntimeError("Missing TAVILY_API_KEY in environment")
        self.client = TavilyClient(api_key=api_key)

    def search(self, query: str, count: int = 6) -> List[Dict]:
        """
        Fetch images for a query.

        Returns a list of {"title", "thumbnail_url", "content_url"} dicts;
        empty list on any API failure.
        """
        try:
            resp = self.client.search(
                query=query,
                max_results=count,
                include_images=True,
                include_answer=False
            )
        except Exception as e:
            print("Tavily image search error:", e)
            return []

        results: List[Dict] = []
        for entry in resp.get("images", []):
            if isinstance(entry, dict):
                # Rich result: take the best available field for each slot.
                results.append({
                    "title": entry.get("title", entry.get("description", "")),
                    "thumbnail_url": entry.get("thumbnail", entry.get("thumbnail_url", entry.get("url", ""))),
                    "content_url": entry.get("url", entry.get("content_url", "")),
                })
            else:
                # Some API responses return bare URL strings instead of dicts.
                url_str = str(entry)
                results.append({
                    "title": "",
                    "thumbnail_url": url_str,
                    "content_url": url_str,
                })

        return results
tools/knowledge_panel.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tools/knowledge_panel.py
2
+
3
+ import requests
4
+ from tavily import TavilyClient
5
+ from typing import Dict, List
6
+ import os
7
+
8
+
9
class KnowledgePanel:
    """
    Builds an entity knowledge panel similar to Perplexity:
    - Top image
    - Summary
    - Basic facts
    - Wikipedia link
    """

    def __init__(self):
        self.client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    def get_wikipedia_extract(self, query: str) -> Dict:
        """
        Returns summary + infobox data from Wikipedia.

        Returns an empty dict on any network, HTTP, or parse failure.
        """
        try:
            url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{query.replace(' ', '_')}"
            r = requests.get(url, timeout=10)
            # A 404 for an unknown page returns a JSON error document; treat
            # any non-2xx as "no panel" instead of building garbage from it.
            r.raise_for_status()
            data = r.json()

            return {
                "title": data.get("title", ""),
                "description": data.get("description", ""),
                "summary": data.get("extract", ""),
                "thumbnail": data.get("thumbnail", {}).get("source", ""),
                "url": data.get("content_urls", {}).get("desktop", {}).get("page", "")
            }
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C/SystemExit propagate.
            return {}

    def get_fast_facts(self, query: str) -> List[str]:
        """
        Uses Tavily qna to extract AI-generated facts.

        Returns at most 8 fact strings; empty list on failure.
        """
        try:
            resp = self.client.qna(
                query=f"List 8 short bullet facts about {query}. No explanation, only facts.",
                n_tokens=150
            )
            answer = resp.get("answer", "")
            # Split into lines and strip leading bullet/dash markers.
            fact_lines = [line.strip("-• ").strip() for line in answer.split("\n") if line.strip()]
            return fact_lines[:8]  # Return max 8 facts
        except Exception:
            # Narrowed from a bare `except:`; facts are optional, so degrade.
            return []

    def build_panel(self, query: str) -> Dict:
        """
        Builds the full knowledge panel.

        Returns:
            {"wiki": <wikipedia dict or {}>, "facts": <list of fact strings>}
        """
        wiki = self.get_wikipedia_extract(query)
        facts = self.get_fast_facts(query)

        return {
            "wiki": wiki,
            "facts": facts
        }
tools/memory_tool.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+
4
+ class MemoryTool:
5
+ """Simple in-memory workspace chat history."""
6
+
7
+ def __init__(self) -> None:
8
+ self.store: Dict[str, List[Dict[str, str]]] = {}
9
+ self.profile: Dict[str, Dict[str, str]] = {} # Store user metadata like name
10
+
11
+ def add(self, workspace_id: str, role: str, content: str) -> None:
12
+ self.store.setdefault(workspace_id, []).append(
13
+ {"role": role, "content": content}
14
+ )
15
+
16
+ def get_context(self, workspace_id: str, max_messages: int = 10) -> str:
17
+ msgs = self.store.get(workspace_id, [])[-max_messages:]
18
+ return "\n".join(f"{m['role'].upper()}: {m['content']}" for m in msgs)
19
+
20
+ def get_recent_messages(self, workspace_id: str, limit: int = 6) -> List[Dict[str, str]]:
21
+ """Get recent messages for LLM context (default last 6 messages)."""
22
+ return self.store.get(workspace_id, [])[-limit:]
23
+
24
+ def get_long_chat(self, workspace_id: str) -> List[Dict[str, str]]:
25
+ """Get entire chat history for long-term memory context."""
26
+ return self.store.get(workspace_id, [])
27
+
28
+ def set_name(self, workspace_id: str, name: str) -> None:
29
+ """Store user's name in profile."""
30
+ self.profile[workspace_id] = {"name": name}
31
+
32
+ def get_name(self, workspace_id: str) -> str:
33
+ """Retrieve user's name from profile."""
34
+ return self.profile.get(workspace_id, {}).get("name")
tools/name_extractor.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
class NameExtractor:
    """Pulls a user's name out of self-introduction phrases."""

    # \b prevents matching the tail of another word: without it,
    # "hi am bob" would match via the trailing "i am" of "hi".
    _PATTERN = re.compile(r"\b(i am|my name is)\s+([A-Za-z]+)")

    def extract(self, text: str):
        """Return the name in title case, or None if no phrase matches.

        Recognized formats: "i am naveen", "my name is naveen".
        """
        match = self._PATTERN.search(text.lower())
        if match:
            return match.group(2).title()
        return None
tools/name_tool.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
class NameTool:
    """Extract user names from natural language messages."""

    # \b keeps us from matching the tail of another word ("hi am bob"),
    # and \s+ tolerates multiple spaces between the phrase and the name.
    _PATTERNS = [
        r"\bi am\s+([a-zA-Z]+)",
        r"\bi'm\s+([a-zA-Z]+)",
        r"\bmy name is\s+([a-zA-Z]+)"
    ]

    def extract_name(self, text: str):
        """
        Extract name from sentences like:
        - i am naveen
        - I'm Naveen
        - my name is naveen

        Returns:
            The name in title case, or None when nothing matches.
        """
        text = text.lower()

        for p in self._PATTERNS:
            m = re.search(p, text)
            if m:
                name = m.group(1).strip().title()
                return name

        return None
tools/reranker_tool.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from sentence_transformers import CrossEncoder
3
+ from langchain.schema import Document
4
+
5
+
6
class Reranker:
    """Cross-encoder reranker for retrieved docs."""

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> None:
        # A cross-encoder scores each (query, passage) pair jointly, which
        # gives a sharper final ranking than bi-encoder similarity alone.
        self.model = CrossEncoder(model_name)

    def rerank(self, query: str, docs: List[Document], top_k: int = 5) -> List[Document]:
        """Return the *top_k* documents most relevant to *query*, best first."""
        if not docs:
            return []
        relevance = self.model.predict([[query, doc.page_content] for doc in docs])
        ranked = sorted(zip(docs, relevance), key=lambda item: item[1], reverse=True)
        return [doc for doc, _score in ranked[:top_k]]
tools/search_tool.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Dict
3
+ import requests
4
+ from config.config import Config
5
+
6
+
7
class SearchTool:
    """Tavily web search wrapper."""

    def __init__(self) -> None:
        self.api_key = os.getenv("TAVILY_API_KEY") or Config.TAVILY_API_KEY
        if not self.api_key:
            raise RuntimeError("TAVILY_API_KEY missing in .env")

    def search(self, query: str, num_results: int = 5) -> List[Dict]:
        """Run a Tavily web search and return its raw result list.

        Args:
            query: Free-text search query.
            num_results: Maximum number of results to request.

        Returns:
            List of result dicts as returned by the Tavily API.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        url = "https://api.tavily.com/search"
        # Tavily's REST API expects "max_results" (the original sent
        # "num_results", which the API ignores) and a Bearer-prefixed
        # Authorization header.
        payload = {"query": query, "max_results": num_results}
        headers = {"Authorization": f"Bearer {self.api_key}"}
        resp = requests.post(url, json=payload, headers=headers, timeout=20)
        resp.raise_for_status()
        data = resp.json()
        return data.get("results", [])
tools/summarizer_tool.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Summarization helper using the main LLM."""
2
+
3
+ from config.config import Config
4
+
5
+
6
class SummarizerTool:
    """Summarizes long texts using the LLM."""

    def __init__(self) -> None:
        # Reuse the project-wide LLM so summaries match the main model.
        self.llm = Config.get_llm()

    def summarize(self, text: str, max_words: int = 300) -> str:
        """
        Summarize the provided text.

        Args:
            text: Input text.
            max_words: Target summary length.

        Returns:
            Summary string.
        """
        instruction = f"Summarize the following text in about {max_words} words:\n\n{text}"
        return self.llm.invoke(instruction).content
tools/wiki_tool.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Wikipedia search tool."""
2
+
3
+ from langchain_community.utilities import WikipediaAPIWrapper
4
+
5
+
6
class WikiTool:
    """Wrapper for Wikipedia-based QA."""

    def __init__(self) -> None:
        # Pull the top 3 English articles for broader coverage.
        self.api = WikipediaAPIWrapper(top_k_results=3, lang="en")

    def query(self, query: str) -> str:
        """Search Wikipedia and return a summarized answer."""
        summary = self.api.run(query)
        return summary
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
vectorstore/__init__.py ADDED
File without changes
vectorstore/store.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from langchain.schema import Document
3
+ from langchain_community.vectorstores import FAISS
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+
6
+
7
class VectorStore:
    """FAISS vector store wrapper."""

    def __init__(self) -> None:
        self.embedding = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.store: FAISS | None = None
        self.retriever = None

    def create(self, docs: List[Document]) -> None:
        """Create FAISS index from documents (replaces any existing index)."""
        self.store = FAISS.from_documents(docs, self.embedding)
        self.retriever = self.store.as_retriever()

    def retrieve(self, query: str, k: int = 8) -> List[Document]:
        """Return the *k* documents most similar to *query*.

        Raises:
            RuntimeError: If create() has not been called yet.
        """
        if self.retriever is None:
            raise RuntimeError("Vector store not initialized.")
        # Query the store directly so the caller's `k` is honored; the
        # default retriever ignores `k` and always returns its own fixed
        # top-4, which was the original bug.
        return self.store.similarity_search(query, k=k)