NeelTA committed on
Commit
d2fe6cc
·
1 Parent(s): 18f151e

initial commit

Browse files
Files changed (12) hide show
  1. .gitignore +225 -0
  2. Dockerfile +16 -0
  3. docHandler.py +141 -0
  4. frontend/css/styles.css +466 -0
  5. frontend/index.html +88 -0
  6. frontend/js/main.js +493 -0
  7. main.py +269 -0
  8. pdfHandler.py +141 -0
  9. requirements.txt +42 -0
  10. task_manager.py +145 -0
  11. txtHandler.py +141 -0
  12. webHandler.py +110 -0
.gitignore ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Node.js
2
+ node_modules/
3
+ npm-debug.log*
4
+ yarn-debug.log*
5
+ yarn-error.log*
6
+ .npm
7
+ .env.development.local
8
+ .env.test.local
9
+ .env.production.local
10
+ .env.local
11
+
12
+ # Byte-compiled / optimized / DLL files
13
+ __pycache__/
14
+ *.py[codz]
15
+ *$py.class
16
+
17
+ # C extensions
18
+ *.so
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ parts/
31
+ sdist/
32
+ var/
33
+ wheels/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py.cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+ cover/
64
+
65
+ # Translations
66
+ *.mo
67
+ *.pot
68
+
69
+ # Django stuff:
70
+ *.log
71
+ local_settings.py
72
+ db.sqlite3
73
+ db.sqlite3-journal
74
+
75
+ # Flask stuff:
76
+ instance/
77
+ .webassets-cache
78
+
79
+ # Scrapy stuff:
80
+ .scrapy
81
+
82
+ # Sphinx documentation
83
+ docs/_build/
84
+
85
+ # PyBuilder
86
+ .pybuilder/
87
+ target/
88
+
89
+ # Jupyter Notebook
90
+ .ipynb_checkpoints
91
+
92
+ # IPython
93
+ profile_default/
94
+ ipython_config.py
95
+
96
+ # pyenv
97
+ # For a library or package, you might want to ignore these files since the code is
98
+ # intended to run in multiple environments; otherwise, check them in:
99
+ # .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # UV
109
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ #uv.lock
113
+
114
+ # poetry
115
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
116
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
117
+ # commonly ignored for libraries.
118
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
119
+ #poetry.lock
120
+ #poetry.toml
121
+
122
+ # pdm
123
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
124
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
125
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
126
+ #pdm.lock
127
+ #pdm.toml
128
+ .pdm-python
129
+ .pdm-build/
130
+
131
+ # pixi
132
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
133
+ #pixi.lock
134
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
135
+ # in the .venv directory. It is recommended not to include this directory in version control.
136
+ .pixi
137
+
138
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
139
+ __pypackages__/
140
+
141
+ # Celery stuff
142
+ celerybeat-schedule
143
+ celerybeat.pid
144
+
145
+ # SageMath parsed files
146
+ *.sage.py
147
+
148
+ # Environments
149
+ .env
150
+ .envrc
151
+ .venv
152
+ env/
153
+ venv/
154
+ ENV/
155
+ env.bak/
156
+ venv.bak/
157
+
158
+ # Spyder project settings
159
+ .spyderproject
160
+ .spyproject
161
+
162
+ # Rope project settings
163
+ .ropeproject
164
+
165
+ # mkdocs documentation
166
+ /site
167
+
168
+ # mypy
169
+ .mypy_cache/
170
+ .dmypy.json
171
+ dmypy.json
172
+
173
+ # Pyre type checker
174
+ .pyre/
175
+
176
+ # pytype static type analyzer
177
+ .pytype/
178
+
179
+ # Cython debug symbols
180
+ cython_debug/
181
+
182
+ # PyCharm
183
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
184
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
185
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
186
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
187
+ #.idea/
188
+
189
+ # Abstra
190
+ # Abstra is an AI-powered process automation framework.
191
+ # Ignore directories containing user credentials, local state, and settings.
192
+ # Learn more at https://abstra.io/docs
193
+ .abstra/
194
+
195
+ # Visual Studio Code
196
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
197
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
198
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
199
+ # you could uncomment the following to ignore the entire vscode folder
200
+ # .vscode/
201
+
202
+ # Ruff stuff:
203
+ .ruff_cache/
204
+
205
+ # PyPI configuration file
206
+ .pypirc
207
+
208
+ # Cursor
209
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
210
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
211
+ # refer to https://docs.cursor.com/context/ignore-files
212
+ .cursorignore
213
+ .cursorindexingignore
214
+
215
+ # Marimo
216
+ marimo/_static/
217
+ marimo/_lsp/
218
+ __marimo__/
219
+
220
+ # ChatWithDoc specific
221
+ chatWithDocEnv/
222
+ uploaded_files/
223
+ .vscode/
224
+ .env
225
+ .env.local
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user with uid 1000, as recommended for HF Spaces.
RUN useradd -m -u 1000 user
USER user
# Make user-installed pip entry points (e.g. uvicorn) resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies before copying source so this layer is cached
# across code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application source last.
COPY --chown=user . /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
docHandler.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import Docx2txtLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain.chat_models import init_chat_model
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import faiss
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import os
9
+ from langchain import hub
10
+ from dotenv import load_dotenv
11
+ from langgraph.graph import START, StateGraph
12
+ from typing import List, Dict, Any, Optional
13
+ from pydantic import BaseModel, Field
14
+ from langchain.docstore.document import Document
15
+
16
+ load_dotenv()
17
+
18
class State(BaseModel):
    """Shared state flowing through the retrieve -> generate LangGraph steps."""

    # The user's question; drives both retrieval and generation.
    question: str = Field(..., description="Type your question here")
    # Chunks retrieved from the vector store for this question.
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    # Final LLM answer; empty until the generate step has run.
    answer: str = Field(default="", description="Answer will be here")

26
class DocProcessor:
    """RAG pipeline over a single DOCX file.

    Loads a .docx, splits it into overlapping character chunks, embeds the
    chunks into an in-memory FAISS index, and answers questions through a
    two-step retrieve -> generate LangGraph driven by a Gemini chat model.
    """

    def __init__(self):
        # Load model provider.
        # Fail fast on a missing key; otherwise the first LLM call would fail
        # later with a less obvious error.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        # Local, CPU-only sentence-transformer embeddings (no external API calls).
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}
        )
        # Community RAG prompt template pulled from the LangChain hub.
        self.prompt = hub.pull("rlm/rag-prompt")
        self.vector_store = None  # populated by process_docx()
        self.chunk_size = 1000    # characters per chunk
        self.chunk_overlap = 200  # characters shared by adjacent chunks


    def process_docx(self, file_path: str) -> Dict[str, Any]:
        """
        Process a DOCX file and prepare it for querying

        Args:
            file_path (str): Path to the DOCX file

        Returns:
            Dict[str, Any]: Processing status and information
        """
        try:
            # Document Loading
            loader = Docx2txtLoader(file_path)
            pages = loader.load()

            # Text Splitting
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
            texts = text_splitter.split_documents(pages)

            # Vector Store Setup
            # Embed a dummy string once to discover the embedding dimensionality
            # needed to size the FAISS index.
            embedding_dim = len(self.embedding_model.embed_query("test"))
            index = faiss.IndexFlatL2(embedding_dim)

            # NOTE: a fresh store is created on every call, so processing a new
            # file replaces any previously indexed document.
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=index,
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )

            # Index chunks
            self.vector_store.add_documents(documents=texts)

            return {
                "status": "success",
                "message": "DOCX processed successfully",
                "num_pages": len(pages),
                "num_chunks": len(texts)
            }
        except Exception as e:
            # Surface the failure as a status dict rather than raising, so the
            # web layer can return it to the client directly.
            return {
                "status": "error",
                "message": f"Error processing DOCX: {str(e)}"
            }

    def query_response(self, query: str) -> Dict[str, Any]:
        """
        Query the processed document

        Args:
            query (str): The question to ask about the document

        Returns:
            Dict[str, Any]: Answer and relevant context
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            # Create state graph (rebuilt per query; cheap relative to the LLM call)
            graph_builder = StateGraph(State)

            # Define retrieval step: fetch the chunks most similar to the question.
            def retrieve(state: State):
                retrieved_docs = self.vector_store.similarity_search(state.question)
                return {"context": retrieved_docs}

            # Define generation step: stuff the retrieved chunks into the RAG
            # prompt and ask the LLM for an answer.
            def generate(state: State):
                docs_content = "\n\n".join(doc.page_content for doc in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": docs_content
                })
                response = self.llm.invoke(messages)
                return {"answer": response.content}

            # Build and compile the graph: retrieve -> generate, entry at retrieve.
            graph = graph_builder.add_sequence([retrieve, generate]).set_entry_point("retrieve").compile()

            # Execute the query
            response = graph.invoke({
                "question": query
            })

            return {
                "status": "success",
                "answer": response["answer"],
                "query": query
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }

frontend/css/styles.css ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --primary: #4361ee;
3
+ --primary-light: #4895ef;
4
+ --secondary: #3f37c9;
5
+ --accent: #4cc9f0;
6
+ --light: #f8f9fa;
7
+ --dark: #212529;
8
+ --success: #4ade80;
9
+ --warning: #facc15;
10
+ --danger: #f87171;
11
+ --gray: #6c757d;
12
+ --light-gray: #e9ecef;
13
+ --border-radius: 12px;
14
+ --shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
15
+ --transition: all 0.3s ease;
16
+ }
17
+
18
+ * {
19
+ margin: 0;
20
+ padding: 0;
21
+ box-sizing: border-box;
22
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
23
+ }
24
+
25
+ body {
26
+ background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%);
27
+ color: var(--dark);
28
+ min-height: 100vh;
29
+ padding: 20px;
30
+ display: flex;
31
+ flex-direction: column;
32
+ align-items: center;
33
+ }
34
+
35
+ .container {
36
+ width: 100%;
37
+ max-width: 1200px;
38
+ margin: 0 auto;
39
+ }
40
+
41
+ header {
42
+ text-align: center;
43
+ padding: 30px 0;
44
+ animation: fadeIn 0.8s ease-out;
45
+ }
46
+
47
+ header h1 {
48
+ font-size: 2.8rem;
49
+ margin-bottom: 10px;
50
+ color: var(--secondary);
51
+ background: linear-gradient(90deg, var(--primary), var(--accent));
52
+ -webkit-background-clip: text;
53
+ background-clip: text;
54
+ color: transparent;
55
+ }
56
+
57
+ header p {
58
+ font-size: 1.2rem;
59
+ color: var(--gray);
60
+ max-width: 600px;
61
+ margin: 0 auto;
62
+ }
63
+
64
+ .app-container {
65
+ display: flex;
66
+ gap: 30px;
67
+ margin-top: 20px;
68
+ }
69
+
70
+ @media (max-width: 900px) {
71
+ .app-container {
72
+ flex-direction: column;
73
+ }
74
+ }
75
+
76
+ .input-section {
77
+ flex: 1;
78
+ background: white;
79
+ border-radius: var(--border-radius);
80
+ padding: 25px;
81
+ box-shadow: var(--shadow);
82
+ animation: slideInLeft 0.6s ease-out;
83
+ }
84
+
85
+ .chat-section {
86
+ flex: 1.5;
87
+ display: flex;
88
+ flex-direction: column;
89
+ background: white;
90
+ border-radius: var(--border-radius);
91
+ box-shadow: var(--shadow);
92
+ overflow: hidden;
93
+ animation: slideInRight 0.6s ease-out;
94
+ }
95
+
96
+ .section-title {
97
+ font-size: 1.5rem;
98
+ margin-bottom: 20px;
99
+ color: var(--secondary);
100
+ display: flex;
101
+ align-items: center;
102
+ gap: 10px;
103
+ }
104
+
105
+ .section-title i {
106
+ background: var(--light-gray);
107
+ width: 40px;
108
+ height: 40px;
109
+ border-radius: 50%;
110
+ display: flex;
111
+ align-items: center;
112
+ justify-content: center;
113
+ }
114
+
115
+ .upload-area {
116
+ border: 2px dashed var(--light-gray);
117
+ border-radius: var(--border-radius);
118
+ padding: 30px;
119
+ text-align: center;
120
+ margin-bottom: 25px;
121
+ transition: var(--transition);
122
+ cursor: pointer;
123
+ }
124
+
125
+ .upload-area:hover {
126
+ border-color: var(--primary);
127
+ background: rgba(67, 97, 238, 0.05);
128
+ }
129
+
130
+ .upload-area i {
131
+ font-size: 3rem;
132
+ color: var(--primary);
133
+ margin-bottom: 15px;
134
+ }
135
+
136
+ .upload-area h3 {
137
+ margin-bottom: 10px;
138
+ color: var(--dark);
139
+ }
140
+
141
+ .upload-area p {
142
+ color: var(--gray);
143
+ margin-bottom: 20px;
144
+ }
145
+
146
+ .file-types {
147
+ display: flex;
148
+ justify-content: center;
149
+ gap: 15px;
150
+ margin-top: 15px;
151
+ }
152
+
153
+ .file-type {
154
+ background: var(--light-gray);
155
+ padding: 8px 15px;
156
+ border-radius: 30px;
157
+ font-size: 0.9rem;
158
+ }
159
+
160
+ .url-input {
161
+ margin-bottom: 25px;
162
+ }
163
+
164
+ .url-input label {
165
+ display: block;
166
+ margin-bottom: 8px;
167
+ font-weight: 500;
168
+ }
169
+
170
+ .url-input input {
171
+ width: 100%;
172
+ padding: 14px;
173
+ border: 1px solid var(--light-gray);
174
+ border-radius: var(--border-radius);
175
+ font-size: 1rem;
176
+ transition: var(--transition);
177
+ }
178
+
179
+ .url-input input:focus {
180
+ outline: none;
181
+ border-color: var(--primary);
182
+ box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.2);
183
+ }
184
+
185
+ .btn {
186
+ background: var(--primary);
187
+ color: white;
188
+ border: none;
189
+ padding: 14px 25px;
190
+ border-radius: var(--border-radius);
191
+ font-size: 1rem;
192
+ font-weight: 600;
193
+ cursor: pointer;
194
+ transition: var(--transition);
195
+ display: inline-flex;
196
+ align-items: center;
197
+ justify-content: center;
198
+ gap: 8px;
199
+ }
200
+
201
+ .btn:hover {
202
+ background: var(--secondary);
203
+ transform: translateY(-2px);
204
+ }
205
+
206
+ .btn:active {
207
+ transform: translateY(0);
208
+ }
209
+
210
+ .btn-block {
211
+ width: 100%;
212
+ }
213
+
214
+ .btn-outline {
215
+ background: transparent;
216
+ border: 2px solid var(--primary);
217
+ color: var(--primary);
218
+ }
219
+
220
+ .btn-outline:hover {
221
+ background: var(--primary);
222
+ color: white;
223
+ }
224
+
225
+ .file-list {
226
+ margin-top: 25px;
227
+ }
228
+
229
+ .file-item {
230
+ display: flex;
231
+ align-items: center;
232
+ padding: 12px 15px;
233
+ background: var(--light-gray);
234
+ border-radius: var(--border-radius);
235
+ margin-bottom: 10px;
236
+ animation: fadeIn 0.3s ease-out;
237
+ }
238
+
239
+ .file-item i {
240
+ margin-right: 12px;
241
+ color: var(--primary);
242
+ }
243
+
244
+ .file-info {
245
+ flex: 1;
246
+ }
247
+
248
+ .file-name {
249
+ font-weight: 500;
250
+ margin-bottom: 3px;
251
+ }
252
+
253
+ .file-size {
254
+ font-size: 0.85rem;
255
+ color: var(--gray);
256
+ }
257
+
258
+ .file-actions {
259
+ display: flex;
260
+ gap: 10px;
261
+ }
262
+
263
+ .file-actions button {
264
+ background: none;
265
+ border: none;
266
+ color: var(--gray);
267
+ cursor: pointer;
268
+ font-size: 1.1rem;
269
+ transition: var(--transition);
270
+ }
271
+
272
+ .file-actions button:hover {
273
+ color: var(--danger);
274
+ }
275
+
276
+ .chat-header {
277
+ background: var(--primary);
278
+ color: white;
279
+ padding: 20px;
280
+ display: flex;
281
+ align-items: center;
282
+ gap: 15px;
283
+ }
284
+
285
+ .chat-header img {
286
+ width: 50px;
287
+ height: 50px;
288
+ border-radius: 50%;
289
+ background: white;
290
+ padding: 5px;
291
+ }
292
+
293
+ .chat-messages {
294
+ flex: 1;
295
+ padding: 25px;
296
+ overflow-y: auto;
297
+ display: flex;
298
+ flex-direction: column;
299
+ gap: 20px;
300
+ background: #f8fafc;
301
+ }
302
+
303
+ .message {
304
+ max-width: 80%;
305
+ padding: 18px;
306
+ border-radius: var(--border-radius);
307
+ animation: fadeIn 0.3s ease-out;
308
+ position: relative;
309
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
310
+ }
311
+
312
+ .user-message {
313
+ background: var(--primary-light);
314
+ color: white;
315
+ align-self: flex-end;
316
+ border-bottom-right-radius: 5px;
317
+ }
318
+
319
+ .bot-message {
320
+ background: white;
321
+ border: 1px solid var(--light-gray);
322
+ align-self: flex-start;
323
+ border-bottom-left-radius: 5px;
324
+ }
325
+
326
+ .message-header {
327
+ display: flex;
328
+ align-items: center;
329
+ margin-bottom: 8px;
330
+ font-weight: 600;
331
+ }
332
+
333
+ .message-header i {
334
+ margin-right: 8px;
335
+ }
336
+
337
+ .message-content {
338
+ line-height: 1.5;
339
+ }
340
+
341
+ .typing-indicator {
342
+ display: flex;
343
+ align-items: center;
344
+ padding: 18px;
345
+ background: white;
346
+ border: 1px solid var(--light-gray);
347
+ border-radius: var(--border-radius);
348
+ align-self: flex-start;
349
+ border-bottom-left-radius: 5px;
350
+ width: 100px;
351
+ }
352
+
353
+ .typing-dot {
354
+ width: 8px;
355
+ height: 8px;
356
+ background: var(--gray);
357
+ border-radius: 50%;
358
+ margin: 0 3px;
359
+ animation: typing 1.4s infinite ease-in-out;
360
+ }
361
+
362
+ .typing-dot:nth-child(1) { animation-delay: 0s; }
363
+ .typing-dot:nth-child(2) { animation-delay: 0.2s; }
364
+ .typing-dot:nth-child(3) { animation-delay: 0.4s; }
365
+
366
+ .chat-input {
367
+ display: flex;
368
+ padding: 20px;
369
+ background: white;
370
+ border-top: 1px solid var(--light-gray);
371
+ }
372
+
373
+ .chat-input input {
374
+ flex: 1;
375
+ padding: 16px;
376
+ border: 1px solid var(--light-gray);
377
+ border-radius: 30px;
378
+ font-size: 1rem;
379
+ transition: var(--transition);
380
+ }
381
+
382
+ .chat-input input:focus {
383
+ outline: none;
384
+ border-color: var(--primary);
385
+ box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.2);
386
+ }
387
+
388
+ .chat-input button {
389
+ background: var(--primary);
390
+ color: white;
391
+ border: none;
392
+ width: 50px;
393
+ height: 50px;
394
+ border-radius: 50%;
395
+ margin-left: 15px;
396
+ cursor: pointer;
397
+ transition: var(--transition);
398
+ display: flex;
399
+ align-items: center;
400
+ justify-content: center;
401
+ }
402
+
403
+ .chat-input button:hover {
404
+ background: var(--secondary);
405
+ transform: scale(1.05);
406
+ }
407
+
408
+
409
+
410
+ @keyframes fadeIn {
411
+ from { opacity: 0; transform: translateY(10px); }
412
+ to { opacity: 1; transform: translateY(0); }
413
+ }
414
+
415
+ @keyframes slideInLeft {
416
+ from { opacity: 0; transform: translateX(-30px); }
417
+ to { opacity: 1; transform: translateX(0); }
418
+ }
419
+
420
+ @keyframes slideInRight {
421
+ from { opacity: 0; transform: translateX(30px); }
422
+ to { opacity: 1; transform: translateX(0); }
423
+ }
424
+
425
+ @keyframes typing {
426
+ 0%, 60%, 100% { transform: translateY(0); }
427
+ 30% { transform: translateY(-5px); }
428
+ }
429
+
430
+ .processing {
431
+ display: flex;
432
+ align-items: center;
433
+ justify-content: center;
434
+ padding: 30px;
435
+ color: var(--gray);
436
+ }
437
+
438
+ .processing i {
439
+ font-size: 2rem;
440
+ margin-right: 15px;
441
+ color: var(--primary);
442
+ animation: spin 1.5s linear infinite;
443
+ }
444
+
445
+ @keyframes spin {
446
+ 0% { transform: rotate(0deg); }
447
+ 100% { transform: rotate(360deg); }
448
+ }
449
+
450
+ footer {
451
+ text-align: center;
452
+ padding: 30px 0;
453
+ color: var(--gray);
454
+ font-size: 0.9rem;
455
+ margin-top: auto;
456
+ }
457
+
458
+ .pulse {
459
+ animation: pulse 2s infinite;
460
+ }
461
+
462
+ @keyframes pulse {
463
+ 0% { box-shadow: 0 0 0 0 rgba(67, 97, 238, 0.4); }
464
+ 70% { box-shadow: 0 0 0 10px rgba(67, 97, 238, 0); }
465
+ 100% { box-shadow: 0 0 0 0 rgba(67, 97, 238, 0); }
466
+ }
frontend/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ChatWithDoc - Chat with Your Documents</title>
7
+ <link rel="stylesheet" href="css/styles.css">
8
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
+ </head>
10
+ <body>
11
+ <div class="container">
12
+ <header>
13
+ <h1><i class="fas fa-robot"></i> ChatWithDoc</h1>
14
+ <p>Upload documents or enter URLs, then chat with your content using AI</p>
15
+ </header>
16
+
17
+ <div class="app-container">
18
+ <div class="input-section">
19
+ <h2 class="section-title"><i class="fas fa-file-upload"></i> Upload Documents</h2>
20
+
21
+ <div class="upload-area" id="uploadArea">
22
+ <i class="fas fa-cloud-upload-alt"></i>
23
+ <h3>Drag & Drop Files Here</h3>
24
+ <p>Supports PDF, DOC, DOCX, TXT files</p>
25
+ <button class="btn btn-outline">Browse Files</button>
26
+ <input type="file" id="fileInput" multiple accept=".pdf,.doc,.docx,.txt" style="display: none;">
27
+
28
+ <div class="file-types">
29
+ <div class="file-type">PDF</div>
30
+ <div class="file-type">DOC</div>
31
+ <div class="file-type">DOCX</div>
32
+ <div class="file-type">TXT</div>
33
+ </div>
34
+ </div>
35
+
36
+ <div class="file-list" id="fileList">
37
+ <!-- File items will be added here dynamically -->
38
+ </div>
39
+
40
+ <div class="url-input">
41
+ <label for="urlInput"><i class="fas fa-link"></i> Or Enter a Web Page URL</label>
42
+ <input type="url" id="urlInput" placeholder="https://example.com/article">
43
+ </div>
44
+
45
+ <button class="btn btn-block pulse" id="processBtn">
46
+ <i class="fas fa-cogs"></i> Process Documents & URLs
47
+ </button>
48
+ </div>
49
+
50
+ <div class="chat-section">
51
+ <div class="chat-header">
52
+ <img src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'%3E%3Cpath fill='%234361ee' d='M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm-1 15h-2v-6h2v6zm3 0h-2v-6h2v6zm3 0h-2v-6h2v6z'/%3E%3C/svg%3E" alt="AI Assistant">
53
+ <div>
54
+ <h2>ChatWithDoc Assistant</h2>
55
+ <p>Ask me anything about your documents</p>
56
+ </div>
57
+ </div>
58
+
59
+ <div class="chat-messages" id="chatMessages">
60
+ <div class="message bot-message">
61
+ <div class="message-header">
62
+ <i class="fas fa-robot"></i> ChatWithDoc Assistant
63
+ </div>
64
+ <div class="message-content">
65
+ Hello! I'm your document assistant. Upload some documents or enter URLs, then ask me anything about their content. I'll help you find answers quickly.
66
+ </div>
67
+ </div>
68
+ </div>
69
+
70
+
71
+
72
+ <div class="chat-input">
73
+ <input type="text" id="messageInput" placeholder="Ask about your documents...">
74
+ <button id="sendButton">
75
+ <i class="fas fa-paper-plane"></i>
76
+ </button>
77
+ </div>
78
+ </div>
79
+ </div>
80
+
81
+ <footer>
82
+ <p>ChatWithDoc - Chat with your documents using AI</p>
83
+ </footer>
84
+ </div>
85
+
86
+ <script src="js/main.js"></script>
87
+ </body>
88
+ </html>
frontend/js/main.js ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// DOM Elements
const uploadArea = document.getElementById('uploadArea');
const fileInput = document.getElementById('fileInput');
const urlInput = document.getElementById('urlInput');
const processBtn = document.getElementById('processBtn');
const fileList = document.getElementById('fileList');
const chatMessages = document.getElementById('chatMessages');
const messageInput = document.getElementById('messageInput');
const sendButton = document.getElementById('sendButton');

// API Base URL (same origin; the backend also serves this frontend)
const API_BASE = '/';

console.log('JavaScript loaded successfully');

// Event Listeners
uploadArea.addEventListener('click', () => {
    console.log('Upload area clicked');
    fileInput.click();
});

fileInput.addEventListener('change', (e) => {
    console.log('File input changed');
    const files = e.target.files;
    console.log('Files detected:', files.length); // Debug log

    if (files.length > 0) {
        console.log('Files selected:', files.length);

        // Clear previous documents and UI first
        clearPreviousDocuments();

        // Snapshot the live FileList into a plain array BEFORE anything can
        // reset the input, which would invalidate the FileList.
        const fileArray = Array.from(files);
        console.log('Files stored in array:', fileArray.length);

        // Now process each file
        fileArray.forEach((file, index) => {
            console.log(`Processing file ${index + 1}:`, file.name, 'Type:', file.type);
            uploadFile(file);
        });
    } else {
        console.log('No files detected in change event');
    }
});

processBtn.addEventListener('click', () => {
    console.log('Process button clicked');
    processAllDocuments();
});

sendButton.addEventListener('click', () => {
    console.log('Send button clicked');
    sendMessage();
});

// FIX: use 'keydown' instead of the deprecated 'keypress' event; behavior for
// the Enter key is the same, but 'keypress' is deprecated and may be dropped.
messageInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter') {
        console.log('Enter key pressed');
        sendMessage();
    }
});
63
+
64
// Separate function for clearing previous documents (doesn't clear current input)
function clearPreviousDocuments() {
    // Resets the file-list UI, URL field, and chat window, then asks the
    // backend to drop previously processed documents. Intentionally does NOT
    // touch the file <input>, so a just-selected FileList stays valid.
    console.log('Clearing previous documents');

    // Clear the file list UI
    fileList.innerHTML = '';

    // Clear URL input
    urlInput.value = '';

    // Clear chat messages and restore initial state
    chatMessages.innerHTML = `
        <div class="message bot-message">
            <div class="message-header">
                <i class="fas fa-robot"></i> ChatWithDoc Assistant
            </div>
            <div class="message-content">
                Previous documents cleared. Ready for new uploads!
            </div>
        </div>
    `;

    // Call backend to clear previous documents
    // (fire-and-forget: failures are only logged, the UI reset stands)
    fetch(`${API_BASE}clear-documents`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        }
    })
    .then(response => {
        console.log('Clear documents response status:', response.status);
        return response.json();
    })
    .then(data => {
        console.log('Previous documents cleared:', data);
    })
    .catch(error => {
        console.error('Error clearing documents:', error);
    });
}
104
+
105
// Full reset: clears the file <input> itself in addition to everything that
// clearPreviousDocuments() resets. Wire this to a dedicated "clear" button.
function clearAllFilesSync() {
    console.log('Clearing all files completely');

    // Resetting .value invalidates any pending FileList, so only do this for
    // a deliberate full reset.
    fileInput.value = '';

    clearPreviousDocuments();
}
115
+
116
// POST a single file to the backend /upload endpoint as multipart form data,
// reflecting progress ('uploading' -> 'uploaded'/'error') in the file list UI.
function uploadFile(file) {
    console.log('Starting file upload for:', file.name);

    const payload = new FormData();
    payload.append('file', file);

    // Show the entry in the UI immediately, before the request resolves.
    addFileToList(file.name, formatFileSize(file.size), 'uploading');

    console.log('Making fetch request to:', `${API_BASE}upload`);

    fetch(`${API_BASE}upload`, { method: 'POST', body: payload })
        .then((response) => {
            console.log('Upload response status:', response.status);
            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }
            return response.json();
        })
        .then((result) => {
            console.log('Upload response data:', result);
            if (result.error) {
                updateFileStatus(file.name, 'error');
                alert('Error uploading file: ' + result.error);
                return;
            }
            updateFileStatus(file.name, 'uploaded');
            console.log('File uploaded successfully:', file.name);
        })
        .catch((err) => {
            console.error('Upload error:', err);
            updateFileStatus(file.name, 'error');
            alert('Error uploading file: ' + err.message);
        });
}
154
+
155
// Kick off server-side processing for every staged file and/or the URL
// typed into the URL box. A blocking spinner replaces the chat area until
// both pipelines settle; any failure surfaces as an alert.
function processAllDocuments() {
    console.log('Processing all documents');

    const url = urlInput.value.trim();
    const fileItems = document.querySelectorAll('.file-item');

    console.log('URL:', url, 'Files count:', fileItems.length);

    if (fileItems.length === 0 && !url) {
        alert('Please upload files or enter a URL first');
        return;
    }

    showProcessing();

    // Flip every listed file to the given status icon.
    const markAll = (status) => {
        fileItems.forEach(item => updateFileStatus(item.dataset.filename, status));
    };

    // Pipeline 1: files previously staged via /upload.
    let filesDone = Promise.resolve();
    if (fileItems.length > 0) {
        markAll('processing');
        console.log('Calling process-documents endpoint');

        filesDone = fetch(`${API_BASE}process-documents`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' }
        })
            .then(response => {
                console.log('Process documents response status:', response.status);
                if (!response.ok) {
                    throw new Error(`HTTP error! status: ${response.status}`);
                }
                return response.json();
            })
            .then(data => {
                console.log('Process documents response:', data);
                if (data.error) {
                    throw new Error(data.error);
                }
                markAll('processed');
                addBotMessage(`Successfully processed ${data.processed_count} files!`);
                return data;
            });
    }

    // Pipeline 2: the optional URL.
    let urlDone = Promise.resolve();
    if (url) {
        console.log('Processing URL:', url);

        urlDone = fetch(`${API_BASE}process-url`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ url: url })
        })
            .then(response => {
                console.log('Process URL response status:', response.status);
                return response.json();
            })
            .then(data => {
                console.log('Process URL response:', data);
                if (data.error) {
                    throw new Error(data.error);
                }
                addBotMessage(`URL processed successfully! Found ${data.document_info.num_pages} pages with ${data.document_info.num_chunks} text chunks.`);
                return data;
            });
    }

    // Wait for both pipelines before restoring the chat area.
    Promise.all([filesDone, urlDone])
        .then(() => {
            console.log('All processing completed');
            hideProcessing();
            addBotMessage("All documents and URLs have been processed successfully! You can now ask questions about them.");
        })
        .catch(error => {
            console.error('Processing error:', error);
            hideProcessing();
            alert('Error processing documents: ' + error.message);
        });
}
249
+
250
// Send the current chat input to the backend /chat endpoint and render
// the assistant's reply (or an error bubble) when it arrives.
function sendMessage() {
    const message = messageInput.value.trim();
    console.log('Sending message:', message);

    if (!message) {
        return; // ignore empty submissions
    }

    addUserMessage(message);
    messageInput.value = '';
    showTypingIndicator();

    fetch(`${API_BASE}chat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ message: message })
    })
        .then(response => {
            console.log('Chat response status:', response.status);
            return response.json();
        })
        .then(data => {
            console.log('Chat response:', data);
            hideTypingIndicator();
            if (data.error) {
                addBotMessage("Sorry, I encountered an error: " + data.error);
            } else {
                addBotMessage(data.response);
            }
        })
        .catch(error => {
            console.error('Chat error:', error);
            hideTypingIndicator();
            addBotMessage("Sorry, I encountered an error processing your request.");
        });
}
288
+
289
// Render a newly selected file in the sidebar list and wire up its remove
// button. `status` selects the icon shown next to the entry.
// FIX: the file name and size were previously interpolated straight into
// innerHTML, so a file named e.g. `<img onerror=...>` could inject markup;
// they are now assigned via textContent.
function addFileToList(name, size, status = 'success') {
    console.log('Adding file to list:', name, 'Status:', status);

    // Icons keyed by upload/processing state; unknown states fall back to
    // the plain file icon (same behavior as the old if/else chain).
    const STATUS_ICONS = {
        uploading: '<i class="fas fa-spinner fa-spin"></i>',
        processing: '<i class="fas fa-cog fa-spin"></i>',
        error: '<i class="fas fa-exclamation-circle" style="color: var(--danger);"></i>',
        processed: '<i class="fas fa-check-circle" style="color: var(--success);"></i>',
        uploaded: '<i class="fas fa-file-alt"></i>'
    };
    const statusIcon = STATUS_ICONS[status] || '<i class="fas fa-file-alt"></i>';

    const fileItem = document.createElement('div');
    fileItem.className = 'file-item';
    fileItem.dataset.filename = name;

    fileItem.innerHTML = `
        ${statusIcon}
        <div class="file-info">
            <div class="file-name"></div>
            <div class="file-size"></div>
        </div>
        <div class="file-actions">
            <button title="Remove"><i class="fas fa-times"></i></button>
        </div>
    `;
    // textContent renders any HTML in the file name literally (XSS fix).
    fileItem.querySelector('.file-name').textContent = name;
    fileItem.querySelector('.file-size').textContent = size;

    fileList.appendChild(fileItem);

    // Let the user remove the entry again before processing.
    fileItem.querySelector('.file-actions button').addEventListener('click', () => {
        console.log('Removing file:', name);
        fileItem.remove();
    });
}
330
+
331
// Swap the status icon on every list entry whose data-filename matches
// `name`. The first <i> inside the entry is the status icon, so replacing
// its outerHTML updates the icon in place.
function updateFileStatus(name, status) {
    console.log('Updating file status:', name, 'to', status);

    const iconFor = (state) => {
        switch (state) {
            case 'uploading':
                return '<i class="fas fa-spinner fa-spin"></i>';
            case 'processing':
                return '<i class="fas fa-cog fa-spin"></i>';
            case 'error':
                return '<i class="fas fa-exclamation-circle" style="color: var(--danger);"></i>';
            case 'processed':
                return '<i class="fas fa-check-circle" style="color: var(--success);"></i>';
            case 'uploaded':
            default:
                return '<i class="fas fa-file-alt"></i>';
        }
    };

    document.querySelectorAll('.file-item').forEach(item => {
        if (item.dataset.filename !== name) {
            return;
        }
        const iconElement = item.querySelector('i');
        if (iconElement) {
            iconElement.outerHTML = iconFor(status);
        }
    });
}
358
+
359
// Convert a raw byte count into a human-readable string, e.g. 1536 -> "1.5 KB".
// FIX: the unit index is now clamped to the last entry so inputs >= 1 TB no
// longer render as "undefined", and negative/non-finite inputs (which used to
// produce "NaN undefined") are treated as zero.
function formatFileSize(bytes) {
    if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes';
    const k = 1024;
    const sizes = ['Bytes', 'KB', 'MB', 'GB'];
    const i = Math.min(Math.floor(Math.log(bytes) / Math.log(k)), sizes.length - 1);
    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
366
+
367
// Append the user's message bubble to the chat log and scroll it into view.
// FIX: the message body was interpolated into innerHTML unescaped, so typed
// HTML (e.g. <script>, <img onerror=...>) was injected into the DOM; it is
// now assigned via textContent and renders literally.
function addUserMessage(text) {
    const messageDiv = document.createElement('div');
    messageDiv.className = 'message user-message';
    messageDiv.innerHTML = `
        <div class="message-header">
            <i class="fas fa-user"></i> You
        </div>
        <div class="message-content"></div>
    `;
    messageDiv.querySelector('.message-content').textContent = text;
    chatMessages.appendChild(messageDiv);
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
379
+
380
// Append an assistant message to the chat log. Markdown is rendered with
// marked and the resulting HTML is sanitized by DOMPurify before insertion.
function addBotMessage(text) {
    const safeHtml = DOMPurify.sanitize(marked.parse(text));

    const bubble = document.createElement('div');
    bubble.className = 'message bot-message';
    bubble.innerHTML =
        '<div class="message-header">' +
        '<i class="fas fa-robot"></i> ChatWithDoc Assistant' +
        '</div>' +
        `<div class="message-content">${safeHtml}</div>`;

    chatMessages.appendChild(bubble);
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
394
+
395
// Show the animated "assistant is typing" dots at the bottom of the chat.
function showTypingIndicator() {
    const indicator = document.createElement('div');
    indicator.className = 'typing-indicator';
    indicator.id = 'typingIndicator';
    indicator.innerHTML =
        '<div class="typing-dot"></div>' +
        '<div class="typing-dot"></div>' +
        '<div class="typing-dot"></div>';
    chatMessages.appendChild(indicator);
    chatMessages.scrollTop = chatMessages.scrollHeight;
}
407
+
408
// Remove the typing dots if present; safe to call when none are shown.
function hideTypingIndicator() {
    const indicator = document.getElementById('typingIndicator');
    if (indicator !== null) {
        indicator.remove();
    }
}
414
+
415
// Replace the chat area with a single spinner row while documents and
// URLs are being processed on the server.
function showProcessing() {
    console.log('Showing processing indicator');

    const spinner = document.createElement('div');
    spinner.className = 'processing';
    spinner.id = 'processingIndicator';
    spinner.innerHTML = '<i class="fas fa-spinner fa-spin"></i> Processing documents and URLs...';

    chatMessages.innerHTML = '';
    chatMessages.appendChild(spinner);
}
426
+
427
// Remove the processing spinner and restore the default greeting bubble.
function hideProcessing() {
    console.log('Hiding processing indicator');

    const spinner = document.getElementById('processingIndicator');
    if (spinner) {
        spinner.remove();
    }

    // Reset the chat area to its initial state.
    chatMessages.innerHTML = `
        <div class="message bot-message">
            <div class="message-header">
                <i class="fas fa-robot"></i> ChatWithDoc Assistant
            </div>
            <div class="message-content">
                Hello! I'm your document assistant. Upload some documents or enter URLs, then ask me anything about their content. I'll help you find answers quickly.
            </div>
        </div>
    `;
}
446
+
447
// --- Drag & drop wiring for the upload area ---

// MIME types the backend accepts (PDF, DOC, DOCX, TXT).
const SUPPORTED_MIME_TYPES = [
    'application/pdf',
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'text/plain'
];

// Highlight the drop target while a drag hovers over it.
uploadArea.addEventListener('dragover', (e) => {
    e.preventDefault();
    console.log('Drag over upload area');
    uploadArea.style.borderColor = 'var(--primary)';
    uploadArea.style.backgroundColor = 'rgba(67, 97, 238, 0.1)';
});

uploadArea.addEventListener('dragleave', () => {
    console.log('Drag leave upload area');
    uploadArea.style.borderColor = 'var(--light-gray)';
    uploadArea.style.backgroundColor = '';
});

uploadArea.addEventListener('drop', (e) => {
    e.preventDefault();
    console.log('Files dropped on upload area');
    uploadArea.style.borderColor = 'var(--light-gray)';
    uploadArea.style.backgroundColor = '';

    const dropped = Array.from(e.dataTransfer.files);
    console.log('Dropped files count:', dropped.length);
    if (dropped.length === 0) {
        return;
    }

    // A fresh drop replaces any previously staged documents.
    clearPreviousDocuments();

    dropped.forEach(file => {
        console.log('Processing dropped file:', file.name, 'Type:', file.type);
        if (SUPPORTED_MIME_TYPES.includes(file.type)) {
            uploadFile(file);
        } else {
            console.log('Unsupported file type:', file.type);
            alert(`Unsupported file type: ${file.type}. Please upload PDF, DOC, DOCX, or TXT files.`);
        }
    });
});

console.log('All event listeners attached successfully');
main.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile
2
+ from fastapi.responses import JSONResponse, FileResponse
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ import shutil
6
+ import os
7
+ from pydantic import BaseModel, Field
8
+ from typing import Dict, Any, List
9
+ from task_manager import DocumentManager
10
+ import warnings
11
+
12
+ # Disable all LangSmith related warnings
13
+ warnings.filterwarnings("ignore", message=".*LangSmith.*")
14
+ warnings.filterwarnings("ignore", message=".*API key.*")
15
+
16
+ # Also disable UserAgent warning
17
+ os.environ["LANGCHAIN_USER_AGENT"] = "ChatWithDoc/1.0"
18
+
19
+ app = FastAPI()
20
+
21
+ # Initialize document manager
22
+ doc_manager = DocumentManager()
23
+
24
+ # Store uploaded files temporarily before processing
25
+ uploaded_files = []
26
+
27
class UploadResponse(BaseModel):
    """Response returned after a file upload or URL has been accepted/processed."""
    message: str
    document_info: Dict[str, Any]

class URLRequest(BaseModel):
    """Request body for /process-url."""
    url: str = Field(..., description="URL of the document to process")

class ChatRequest(BaseModel):
    """Request body for /chat."""
    message: str = Field(..., description="User's question")

class ChatResponse(BaseModel):
    """Response body for /chat."""
    response: str = Field(..., description="Answer to the user's question")

class ProcessResponse(BaseModel):
    """Summary of a /process-documents run."""
    message: str
    processed_count: int
    # default_factory gives each response its own list instead of a shared
    # mutable class-level default.
    errors: List[str] = Field(default_factory=list)
44
+
45
+ # Allow CORS (update this with your frontend URL in production)
46
+ app.add_middleware(
47
+ CORSMiddleware,
48
+ allow_origins=["*"], # Change to your React frontend URL in prod
49
+ allow_credentials=True,
50
+ allow_methods=["*"],
51
+ allow_headers=["*"],
52
+ )
53
+
54
+ UPLOAD_DIR = "uploaded_files"
55
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
56
+
57
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """
    Upload a document (just stores it, doesn't process yet).

    - **file**: The document file to upload (PDF, DOC, DOCX, TXT)
    - Returns upload confirmation, or a 400 JSON error for unsupported types
    """
    print(f"Received file: {file.filename} of type {file.content_type}")

    if not file.filename:
        return JSONResponse(status_code=400, content={"error": "Missing filename"})

    # SECURITY FIX: keep only the base name so a crafted filename such as
    # "../../etc/passwd" cannot escape UPLOAD_DIR (path traversal).
    safe_name = os.path.basename(file.filename)

    # Map file extensions to content types; browsers often send generic or
    # missing content types, so the extension is the fallback authority.
    extension_to_type = {
        'pdf': 'application/pdf',
        'txt': 'text/plain',
        'doc': 'application/msword',
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    }
    file_extension = safe_name.lower().rsplit('.', 1)[-1] if '.' in safe_name else ''

    if file.content_type in extension_to_type.values():
        content_type = file.content_type
    elif file_extension in extension_to_type:
        content_type = extension_to_type[file_extension]
    else:
        return JSONResponse(status_code=400, content={"error": f"Unsupported file type: {file_extension}"})

    print(f"Using content type: {content_type}")

    file_location = os.path.join(UPLOAD_DIR, safe_name)

    # Persist the upload; actual processing happens later in /process-documents.
    with open(file_location, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Stage the file so /process-documents can pick it up.
    uploaded_files.append({
        "filename": safe_name,
        "file_location": file_location,
        "content_type": content_type
    })

    print("File uploaded successfully, ready for processing")
    return UploadResponse(
        message="File uploaded successfully",
        document_info={
            "filename": safe_name,
            "content_type": content_type,
            "status": "uploaded",
            "location": file_location
        }
    )
113
+
114
@app.post("/process-documents")
async def process_documents():
    """
    Run every file staged by /upload through the document manager.

    Returns a ProcessResponse summary on (partial) success, a 400 error when
    nothing was staged or nothing could be processed, and 500 on unexpected
    failure.
    """
    try:
        if not uploaded_files:
            return JSONResponse(status_code=400, content={"error": "No files uploaded"})

        successes = 0
        errors = []

        for info in uploaded_files:
            try:
                outcome = doc_manager.process_document(info["file_location"], info["content_type"])
            except Exception as exc:
                errors.append(f"{info['filename']}: {exc}")
                print(f"Exception processing {info['filename']}: {exc}")
                continue

            if outcome["status"] == "success":
                successes += 1
                print(f"Successfully processed: {info['filename']}")
            else:
                errors.append(f"{info['filename']}: {outcome['message']}")
                print(f"Failed to process {info['filename']}: {outcome['message']}")

        # The staging list is consumed whether or not processing succeeded.
        uploaded_files.clear()

        if successes == 0:
            return JSONResponse(status_code=400, content={
                "error": f"Failed to process any files. Errors: {'; '.join(errors)}"
            })

        summary = f"Successfully processed {successes} files"
        if errors:
            summary += f". {len(errors)} files had errors."

        return ProcessResponse(
            message=summary,
            processed_count=successes,
            errors=errors
        )

    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
164
+
165
@app.post("/process-url")
async def process_url(url_request: URLRequest):
    """
    Fetch and index a document from a URL via the web processor.

    - **url**: The URL of the document to process
    - Returns document processing information
    """
    target = url_request.url

    try:
        outcome = doc_manager.process_url(target)
        print("URL processing result:", outcome)

        if outcome["status"] == "error":
            return JSONResponse(status_code=400, content={"error": outcome["message"]})

        info = {
            "url": target,
            "status": "processed",
            "type": "url",
            "title": outcome.get("title", "Untitled"),
            "num_pages": outcome.get("num_pages", 0),
            "num_chunks": outcome.get("num_chunks", 0),
            "word_count": outcome.get("word_count", 0),
        }
        return UploadResponse(message="URL processed successfully", document_info=info)

    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
198
+
199
@app.post("/chat")
async def chat_with_doc(chat_request: ChatRequest):
    """
    Answer a user question against the processed document collection.

    - **message**: The user's question
    - Returns a ChatResponse with the combined answer
    """
    try:
        print(f"Received query: {chat_request.message}")
        outcome = doc_manager.query_document(chat_request.message)
        print("Query result:", outcome)

        if outcome["status"] == "error":
            return JSONResponse(status_code=400, content={"error": outcome["message"]})

        return ChatResponse(response=outcome["answer"])
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
220
+
221
@app.post("/clear-documents")
async def clear_documents():
    """Drop every processed document and any files still staged for upload."""
    print("Clearing all documents...")
    try:
        doc_manager.clear_documents()
        uploaded_files.clear()
        return {"message": "Documents cleared successfully"}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
233
+
234
@app.get("/status")
async def get_status():
    """Report how many files are staged and what the document manager holds."""
    try:
        if hasattr(doc_manager, 'get_status'):
            doc_status = doc_manager.get_status()
        else:
            # Fallback for the original single-document manager interface.
            has_current = hasattr(doc_manager, 'current_processor') and doc_manager.current_processor
            doc_status = {
                "total_documents": 1 if has_current else 0,
                "current_document": getattr(doc_manager, 'current_document', None)
            }

        return {
            "uploaded_files": len(uploaded_files),
            "status": doc_status
        }
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
256
+
257
# Liveness probe for deployment platforms (e.g. container health checks).
@app.get("/health")
async def health_check():
    """Return a static payload confirming the API process is up."""
    return {"status": "healthy", "message": "ChatWithDoc API is running"}
262
+
263
# Serve the static frontend from "/". Mounted last so it never shadows the
# API routes declared above.
app.mount("/", StaticFiles(directory="frontend", html=True), name="frontend")

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
pdfHandler.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain.chat_models import init_chat_model
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import faiss
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import os
9
+ from langchain import hub
10
+ from dotenv import load_dotenv
11
+ from langgraph.graph import START, StateGraph
12
+ from typing import List, Dict, Any, Optional
13
+ from pydantic import BaseModel, Field
14
+ from langchain.docstore.document import Document
15
+
16
+ load_dotenv()
17
+
18
class State(BaseModel):
    """Shared state flowing through the retrieve -> generate RAG graph."""
    question: str = Field(..., description="Type your question here")
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    answer: str = Field(default="", description="Answer will be here")
25
+
26
class PDFProcessor:
    """Loads a PDF, indexes it into FAISS, and answers questions via a RAG graph."""

    def __init__(self):
        # The Gemini chat model needs an API key in the environment; fail fast.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}
        )
        # Community RAG prompt: question + retrieved context -> answer.
        self.prompt = hub.pull("rlm/rag-prompt")
        self.vector_store = None  # set once a PDF has been processed
        self.chunk_size = 1000
        self.chunk_overlap = 200

    def process_pdf(self, file_path: str) -> Dict[str, Any]:
        """
        Load *file_path*, split it into overlapping chunks, and build a
        fresh FAISS index over them.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            Dict[str, Any]: "status" plus page/chunk counts on success, or an
            error message on failure.
        """
        try:
            print(f"Processing PDF file: {file_path}")
            pages = PyPDFLoader(file_path).load()

            splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            chunks = splitter.split_documents(pages)

            # Size the FAISS index from one example embedding.
            dim = len(self.embedding_model.embed_query("test"))
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=faiss.IndexFlatL2(dim),
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )
            self.vector_store.add_documents(documents=chunks)

            return {
                "status": "success",
                "message": "PDF processed successfully",
                "num_pages": len(pages),
                "num_chunks": len(chunks),
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing PDF: {str(e)}"
            }

    def query_response(self, query: str) -> Dict[str, Any]:
        """
        Answer *query* against the most recently processed PDF.

        Args:
            query (str): The question to ask about the document

        Returns:
            Dict[str, Any]: {"status": "success", "answer": ..., "query": ...}
            or an error dict when no document is loaded or the graph fails.
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            def retrieve(state: State):
                # Fetch the chunks most similar to the question.
                hits = self.vector_store.similarity_search(state.question)
                return {"context": hits}

            def generate(state: State):
                # Concatenate retrieved chunks and let the LLM answer.
                joined = "\n\n".join(doc.page_content for doc in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": joined
                })
                return {"answer": self.llm.invoke(messages).content}

            # Two-step pipeline: retrieve -> generate.
            workflow = (
                StateGraph(State)
                .add_sequence([retrieve, generate])
                .set_entry_point("retrieve")
                .compile()
            )

            result = workflow.invoke({"question": query})

            return {
                "status": "success",
                "answer": result["answer"],
                "query": query
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }
141
+
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core LangChain dependencies
2
+ langchain
3
+ langchain-community
4
+ langchain-core
5
+ langchain-text-splitters
6
+ langchain-google-genai
7
+ langgraph
8
+ grandalf
9
+
10
+ # FastAPI and web framework
11
+ fastapi
12
+ uvicorn[standard]
13
+ python-multipart
14
+ pydantic
15
+
16
+ # Document processing
17
+ PyPDF2
18
+ python-docx
19
+ docx2txt
20
+ unstructured
21
+ beautifulsoup4
22
+ requests
23
+
24
+ # Embeddings and vector stores - Fixed versions
25
+ sentence-transformers
26
+ faiss-cpu
27
+ numpy
28
+ torch
29
+ transformers
30
+ huggingface-hub
31
+
32
+ # Additional utilities
33
+ python-magic
34
+ python-magic-bin
35
+ pypdf
36
+ lxml
37
+
38
+
39
+
40
+ # Web scraping
41
+ requests
42
+ beautifulsoup4
task_manager.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfHandler import PDFProcessor
2
+ from docHandler import DocProcessor
3
+ from txtHandler import TextProcessor
4
+ from webHandler import WebProcessor
5
+ from typing import Dict, Any, List
6
+
7
class DocumentManager:
    """Routes uploads to the right processor and fans queries out across them."""

    def __init__(self):
        self.pdf_processor = PDFProcessor()
        self.doc_processor = DocProcessor()
        self.txt_processor = TextProcessor()
        self.web_processor = WebProcessor()

        # Each entry: {"processor", "file_path", "content_type", "filename"}
        self.processed_documents = []
        # Best-effort concatenation of everything processed so far.
        self.all_content = ""

    def process_document(self, file_path: str, content_type: str) -> Dict[str, Any]:
        """Dispatch *file_path* to the processor matching *content_type*.

        On success the document joins the collection; the processor's result
        dict is returned either way.
        """
        try:
            result = {"status": "error", "message": "Unknown file type"}
            processor = None

            print(f"Processing file: {file_path} with content type: {content_type}")

            if content_type == "application/pdf":
                result = self.pdf_processor.process_pdf(file_path)
                processor = self.pdf_processor
            elif content_type in (
                "application/msword",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ):
                result = self.doc_processor.process_docx(file_path)
                processor = self.doc_processor
            elif content_type == "text/plain":
                result = self.txt_processor.process_text(file_path)
                processor = self.txt_processor

            if result["status"] == "success" and processor:
                doc_info = {
                    "processor": processor,
                    "file_path": file_path,
                    "content_type": content_type,
                    # FIX: strip both / and \ path prefixes here, instead of
                    # splitting on '/' now and '\\' again at query time.
                    "filename": file_path.replace("\\", "/").split("/")[-1],
                }
                self.processed_documents.append(doc_info)

                # Best-effort: accumulate raw text when the processor exposes it.
                try:
                    if hasattr(processor, 'get_content'):
                        content = processor.get_content()
                        self.all_content += f"\n\n--- Document: {doc_info['filename']} ---\n{content}"
                except Exception:
                    # FIX: was a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit. Aggregation stays optional.
                    pass

                print(f"Document added to collection. Total documents: {len(self.processed_documents)}")

            return result
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def query_document(self, query: str) -> Dict[str, Any]:
        """Ask *query* of every processed document and combine the answers."""
        if not self.processed_documents:
            return {"status": "error", "message": "No documents processed"}

        print(f"Querying {len(self.processed_documents)} documents with question: {query}")

        try:
            answers = []

            for doc_info in self.processed_documents:
                filename = doc_info["filename"]
                try:
                    response = doc_info["processor"].query_response(query)
                except Exception as e:
                    # FIX: name the failing document instead of "(unknown)".
                    print(f"Error querying {filename}: {e}")
                    continue

                if response.get("status") == "success":
                    answer = response.get("answer", "")
                    if answer and answer.strip():
                        answers.append(f"From {filename}:\n {answer}")

            if not answers:
                return {"status": "error", "message": "No relevant information found in any documents"}

            return {
                "status": "success",
                "answer": "\n\n".join(answers)
            }

        except Exception as e:
            # Fallback: answer from the most recently processed document only.
            print(f"Multi-document query failed, using last document: {e}")
            return self.processed_documents[-1]["processor"].query_response(query)

    def clear_documents(self):
        """Forget every processed document and reset the combined content."""
        self.processed_documents = []
        self.all_content = ""
        print("All documents cleared - ready for new uploads")

    def process_url(self, url: str) -> Dict[str, Any]:
        """Process a web page URL and add it to the document collection."""
        try:
            result = self.web_processor.process_url(url)
            if result["status"] == "success":
                doc_info = {
                    "processor": self.web_processor,
                    "file_path": url,
                    "content_type": "text/html",
                    "filename": f"webpage_{url.split('/')[-1] or 'index'}",
                }
                self.processed_documents.append(doc_info)

                try:
                    if hasattr(self.web_processor, 'get_content'):
                        content = self.web_processor.get_content()
                        self.all_content += f"\n\n--- Web Page: {url} ---\n{content}"
                except Exception:
                    # Optional aggregation; never fail URL processing over it.
                    pass

                print(f"URL processed and added to collection: {url}")
            return result
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_status(self) -> Dict[str, Any]:
        """Summarize the current collection for the /status endpoint."""
        return {
            "total_documents": len(self.processed_documents),
            "document_types": list({doc["content_type"] for doc in self.processed_documents}),
            "filenames": [doc["filename"] for doc in self.processed_documents],
        }
+ }
txtHandler.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import TextLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain.chat_models import init_chat_model
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import faiss
6
+ from langchain_community.docstore.in_memory import InMemoryDocstore
7
+ from langchain_community.vectorstores import FAISS
8
+ import os
9
+ from langchain import hub
10
+ from dotenv import load_dotenv
11
+ from langgraph.graph import START, StateGraph
12
+ from typing import List, Dict, Any, Optional
13
+ from pydantic import BaseModel, Field
14
+ from langchain.docstore.document import Document
15
+
16
+ load_dotenv()
17
+
18
class State(BaseModel):
    """Shared state flowing through the LangGraph RAG pipeline.

    The graph's retrieve step fills ``context`` and the generate step fills
    ``answer``; callers only need to supply ``question``.

    NOTE(review): the ``Field`` description strings are part of the pydantic
    schema (runtime data), so they are deliberately left unchanged.
    """
    # The user's question; the only required input to the graph.
    question: str = Field(..., description="Type your question here")
    # Documents retrieved from the vector store for the question.
    context: List[Document] = Field(
        default_factory=list,
        description="A list of Document objects",
    )
    # Generated answer, produced by the LLM generation step.
    answer: str = Field(default="", description="Answer will be here")
25
+
26
class TextProcessor:
    """RAG pipeline over a single plain-text file.

    Loads a text file, splits it into overlapping chunks, indexes the chunks
    in an in-memory FAISS store, and answers questions with a two-step
    retrieve-then-generate LangGraph.
    """

    def __init__(self):
        # Fail fast when the LLM credential is missing.
        if not os.environ.get("GOOGLE_API_KEY"):
            raise ValueError("Google Gemini API key not found in environment variables")

        self.llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
        self.embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
        self.prompt = hub.pull("rlm/rag-prompt")
        # Populated by process_text(); queries fail until then.
        self.vector_store = None
        self.chunk_size = 1000
        self.chunk_overlap = 200

    def process_text(self, file_path: str) -> Dict[str, Any]:
        """Process a text file and prepare it for querying.

        Args:
            file_path (str): Path to the text file.

        Returns:
            Dict[str, Any]: Processing status and information.
        """
        try:
            # Load the file, then split it into overlapping retrieval chunks.
            documents = TextLoader(file_path, encoding='utf-8').load()
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.chunk_size,
                chunk_overlap=self.chunk_overlap,
            )
            chunks = splitter.split_documents(documents)

            # Build a fresh FAISS index sized to the embedding dimensionality.
            dim = len(self.embedding_model.embed_query("test"))
            self.vector_store = FAISS(
                embedding_function=self.embedding_model,
                index=faiss.IndexFlatL2(dim),
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )
            self.vector_store.add_documents(documents=chunks)

            return {
                "status": "success",
                "message": "Text file processed successfully",
                "num_pages": len(documents),
                "num_chunks": len(chunks),
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error processing text file: {str(e)}"
            }

    def query_response(self, query: str) -> Dict[str, Any]:
        """Query the processed document.

        Args:
            query (str): The question to ask about the document.

        Returns:
            Dict[str, Any]: Answer and status (or an error message when no
            document has been processed or the pipeline fails).
        """
        if not self.vector_store:
            return {
                "status": "error",
                "message": "No document has been processed yet"
            }

        try:
            def retrieve(state: State):
                # Pull the chunks most similar to the question.
                docs = self.vector_store.similarity_search(state.question)
                return {"context": docs}

            def generate(state: State):
                # Stuff the retrieved chunks into the RAG prompt and ask the LLM.
                joined = "\n\n".join(d.page_content for d in state.context)
                messages = self.prompt.invoke({
                    "question": state.question,
                    "context": joined
                })
                return {"answer": self.llm.invoke(messages).content}

            # Wire retrieve -> generate and compile the runnable graph.
            builder = StateGraph(State)
            builder.add_sequence([retrieve, generate])
            graph = builder.set_entry_point("retrieve").compile()

            result = graph.invoke({"question": query})

            return {
                "status": "success",
                "answer": result["answer"],
                "query": query,
            }
        except Exception as e:
            return {
                "status": "error",
                "message": f"Error querying document: {str(e)}"
            }
140
+ }
141
+
webHandler.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from typing import Dict, Any
4
+
5
class WebProcessor:
    """Fetches a web page, strips boilerplate markup, and answers simple
    keyword queries against the extracted text."""

    def __init__(self):
        # Extracted page text and its source URL; empty until process_url().
        self.content = ""
        self.url = ""

    def process_url(self, url: str) -> Dict[str, Any]:
        """Process a web page URL"""
        try:
            # Browser-like headers so naive bot filters still serve the page.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }

            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop non-content elements before extracting visible text.
            for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
                tag.decompose()

            raw_text = soup.get_text()

            # Normalize whitespace: strip each line, break lines into
            # fragments, and rejoin the non-empty pieces with single spaces.
            stripped_lines = (ln.strip() for ln in raw_text.splitlines())
            fragments = (piece.strip() for ln in stripped_lines for piece in ln.split(" "))
            cleaned = ' '.join(frag for frag in fragments if frag)

            if not cleaned.strip():
                return {"status": "error", "message": "No text content could be extracted from the webpage"}

            title_tag = soup.find('title')
            page_title = title_tag.get_text().strip() if title_tag else "Untitled"

            self.url = url
            self.content = cleaned.strip()

            word_total = len(cleaned.split())
            return {
                "status": "success",
                "message": "Web page processed successfully",
                "title": page_title,
                "num_pages": 1,
                # Rough chunk estimate: ~100 words per chunk.
                "num_chunks": word_total // 100 + 1,
                "word_count": word_total,
            }

        except requests.exceptions.RequestException as e:
            return {"status": "error", "message": f"Failed to fetch webpage: {str(e)}"}
        except Exception as e:
            return {"status": "error", "message": f"Error processing webpage: {str(e)}"}

    def query_response(self, query: str) -> Dict[str, Any]:
        """Answer a query about the web content"""
        if not self.content:
            return {"status": "error", "message": "No web content available"}

        try:
            # Delegate to the simple keyword matcher.
            return {
                "status": "success",
                "answer": self._search_content(query, self.content),
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_content(self) -> str:
        """Get the extracted content"""
        return self.content

    def _search_content(self, query: str, content: str) -> str:
        """Simple keyword-based search"""
        terms = query.lower().split()

        # Keep only sentences long enough to carry information.
        candidates = [
            fragment.strip()
            for fragment in content.split('.')
            if len(fragment.strip()) > 10
        ]

        # Score each sentence by how many query terms it contains.
        scored = []
        for sentence in candidates:
            lowered = sentence.lower()
            hits = sum(1 for term in terms if term in lowered)
            if hits:
                scored.append((sentence, hits))

        if not scored:
            return "I couldn't find information related to your query on this webpage."

        # Highest score first; the stable sort keeps document order on ties.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return ". ".join(pair[0] for pair in scored[:3])