hacketthadwin commited on
Commit
8ae0191
·
verified ·
1 Parent(s): 373e584

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitignore +207 -0
  2. Dockerfile +18 -0
  3. LICENSE +21 -0
  4. README.md +197 -8
  5. app.py +259 -0
  6. requirements.txt +17 -0
.gitignore ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.11-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /code
6
+
7
+ # Copy the requirements file into the container
8
+ COPY ./requirements.txt /code/requirements.txt
9
+
10
+ # Install the packages specified in requirements.txt
11
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
12
+
13
+ # Copy the rest of your application's code into the container
14
+ COPY . /code/
15
+
16
+ # Command to run the application.
17
+ # Hugging Face Spaces expects the app to run on port 7860.
18
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Adarsh Jha
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,201 @@
1
  ---
2
- title: Intelligent Document QNA Api
3
- emoji: 🏆
4
- colorFrom: green
5
- colorTo: blue
6
  sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: A high-performance API that allows you to find answers withi
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Intelligent Document QnA API
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
 
 
8
  ---
9
 
10
+ # Intelligent Document Q&A API
11
+ [![Status](https://img.shields.io/badge/status-complete-brightgreen)](#)
12
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](#)
13
+ [![Framework](https://img.shields.io/badge/framework-FastAPI-blue)](#)
14
+ [![LangChain](https://img.shields.io/badge/LangChain-enabled-yellow)](#)
15
+ [![PostgreSQL](https://img.shields.io/badge/db-Supabase%20%2B%20pgvector-009688)](#)
16
+ [![Model](https://img.shields.io/badge/AI-GeminiAI%20%2B%20GoogleAI-orange)](#)
17
+
18
+
19
+ A high-performance API that allows you to find answers within your documents using powerful language models. Ingest PDFs, DOCX files, or emails, and get back precise answers to your questions.
20
+
21
+ This project is designed to be simple to set up and use, acting as a robust backend for any application that needs document-based question-answering capabilities.
22
+
23
+ ---
24
+
25
+ ### 🚀 What Makes This Project Different?
26
+
27
+ Unlike many existing RAG APIs, this project is:
28
+
29
+ - **Model-Agnostic and Multi-Provider Friendly**
30
+ Supports **Google Gemini** models out of the box — no hard dependency on OpenAI.
31
+
32
+ - **Cloud-Ready and Free-Tier Optimized**
33
+ Specifically engineered to run smoothly on platforms like **Render**, with memory-efficient caching and lazy model loading.
34
+
35
+ - **Format-Intelligent**
36
+ Automatically detects and uses the correct loader for `.pdf`, `.docx`, and `.eml` files — no manual preprocessing required.
37
+
38
+ - **Minimal Memory Footprint**
39
+ Designed for low-resource environments — ideal for free-tier deployments, research prototypes, or student projects.
40
+
41
+ - **Clean JSON Output**
42
+ Filters out verbose LLM reasoning ("Thought: Let's find the answer...") and returns only the clean, relevant answers.
43
+
44
+ This makes it ideal for developers, students, and startups looking to build document Q&A apps without the complexity or cost of large RAG systems.
45
+
46
+ ---
47
+
48
+ ## ✨ Features
49
+
50
+ - **Multi-Format Support**
51
+ Natively handles `.pdf`, `.docx`, and `.eml` files.
52
+
53
+ - **Persistent Storage**
54
+ Uses **Supabase** with `pgvector` to store document embeddings. Process a document once and query it instantly anytime after.
55
+
56
+ - **High-Quality Answers**
57
+ Leverages state-of-the-art **Google Gemini** language models for accurate embeddings and intelligent Q&A.
58
+
59
+ - **Asynchronous & Fast**
60
+ Built with **FastAPI** for high-performance, non-blocking I/O.
61
+
62
+ - **Easy to Deploy**
63
+ Ready to be containerized with **Docker** or deployed to any modern cloud platform.
64
+
65
+
66
+ ---
67
+
68
+ ## 🚀 Getting Started
69
+
70
+ Follow these steps to get the API server running on your local machine.
71
+
72
+ ### ✅ Prerequisites
73
+
74
+ - Python 3.8+
75
+ - A Supabase account with a project created
76
+ - An API key from Google AI Studio (Gemini)
77
+
78
+ ---
79
+
80
+ ### 📁 1. Clone the Repository
81
+
82
+ ```bash
83
+ git clone https://github.com/hacketthadwin/intelligent-document-qna-api.git
84
+ cd intelligent-document-qna-api
85
+ ```
86
+
87
+ ---
88
+
89
+ ### 🗃️ 2. Set Up Your Supabase Database
90
+
91
+ Enable the `vector` extension in your Supabase project:
92
+
93
+ 1. Go to your Supabase project dashboard
94
+ 2. Navigate to the **SQL Editor**
95
+ 3. Run the following SQL command:
96
+
97
+ ```sql
98
+ create extension if not exists vector;
99
+ ```
100
+
101
+ A `documents` table will be automatically created the first time a document is processed via LangChain.
102
+
103
+ ---
104
+
105
+ ### 🔐 3. Configure Environment Variables
106
+
107
+ Create a `.env` file in the root of your project directory. Use this template:
108
+
109
+ ```env
110
+ # --- Service Keys ---
111
+ GOOGLE_API_KEY="your_google_api_key_here"
112
+ # --- Supabase Credentials for Vector Store ---
113
+ SUPABASE_URL="https://your_supabase_project_id.supabase.co"
114
+ SUPABASE_SERVICE_KEY="your_supabase_service_role_key_here"
115
+ ```
116
+
117
+ ---
118
+
119
+ ### 📦 4. Install Dependencies
120
+
121
+ Install required packages using pip:
122
+
123
+ ```bash
124
+ pip install -r requirements.txt
125
+ ```
126
+
127
+ ---
128
+
129
+ ### ▶️ 5. Run the API Server
130
+
131
+ Start the FastAPI server with:
132
+
133
+ ```bash
134
+ python -m uvicorn app:app --reload
135
+ ```
136
+
137
+ Visit [http://127.0.0.1:8000](http://127.0.0.1:8000) to access the API.
138
+ Interactive docs available at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
139
+
140
+ ---
141
+
142
+ ## ⚙️ API Usage
143
+
144
+ ### `POST /query`
145
+
146
+ This endpoint ingests a document (if it's new) and answers questions about it.
147
+
148
+ ---
149
+
150
+ ### 📤 Request Body
151
+
152
+ - `document_url` (string, required): Public URL to the document you want to query
153
+ - `questions` (array of strings, required): One or more questions to ask
154
+
155
+ #### ✅ Example using `curl`
156
+
157
+ ```bash
158
+ curl -X POST "http://127.0.0.1:8000/query" \
159
+ -H "Content-Type: application/json" \
160
+ -d '{
161
+ "document_url": "https://arxiv.org/pdf/1706.03762.pdf",
162
+ "questions": [
163
+ "What is the title of this paper?",
164
+ "Summarize the abstract in one sentence."
165
+ ]
166
+ }'
167
+ ```
168
+
169
+ ---
170
+
171
+ ### 📥 Example Success Response
172
+
173
+ ```json
174
+ {
175
+ "answers": [
176
+ "The title of the paper is 'Attention Is All You Need'.",
177
+ "The abstract introduces the Transformer, a new network architecture based solely on attention mechanisms that is more parallelizable and requires significantly less time to train than existing models."
178
+ ],
179
+ "document_url": "https://arxiv.org/pdf/1706.03762.pdf",
180
+ "message": "New document processed and vectors stored in database."
181
+ }
182
+ ```
183
+
184
+ ---
185
+
186
+ ## 🤝 Contributing
187
+
188
+ Contributions are welcome! If you have ideas for features or improvements:
189
+
190
+ - Open an issue to discuss
191
+ - Fork the repository
192
+ - Create a new branch
193
+ - Make your changes
194
+ - Submit a pull request
195
+
196
+ ---
197
+
198
+ ## 📄 License
199
+
200
+ This project is licensed under the MIT License. See the [LICENSE](./LICENSE) file for more details.
201
+
app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To run this application:
2
+ # 1. Make sure you have a .env file with your service keys (e.g., GOOGLE_API_KEY, SUPABASE_URL, SUPABASE_SERVICE_KEY).
3
+ # 2. Ensure your Supabase database is set up with the 'vector' extension.
4
+ # 3. Install required packages: pip install -r requirements.txt
5
+ # 4. Run the server: python -m uvicorn app:app --reload
6
+
7
+ import os
8
+ import tempfile
9
+ import json
10
+ import requests
11
+ import asyncio
12
+ import traceback
13
+ from dotenv import load_dotenv
14
+ from typing import List, Optional
15
+
16
+ # --- FastAPI and Pydantic Imports ---
17
+ from fastapi import FastAPI, HTTPException, status
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from pydantic import BaseModel, Field, HttpUrl
20
+
21
+ # --- Langchain and related imports ---
22
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredEmailLoader
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
24
+ from langchain_community.vectorstores import SupabaseVectorStore
25
+ from langchain_together import ChatTogether
26
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
27
+ from langchain.chains import RetrievalQA
28
+ from langchain.prompts import PromptTemplate
29
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
30
+
31
+ # --- Supabase Imports ---
32
+ from supabase.client import Client, create_client
33
+
34
+ # --- 1. INITIALIZATION & CONFIGURATION ---
35
+
36
+ load_dotenv()
37
+ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
38
+ SUPABASE_URL = os.environ.get("SUPABASE_URL")
39
+ SUPABASE_SERVICE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
40
+
41
+ # --- FastAPI App Initialization ---
42
+ app = FastAPI(
43
+ title="Intelligent Document Q&A API",
44
+ description="An API to find answers within documents (PDF, DOCX, EML). It ingests documents from a URL, stores them in a persistent vector database, and allows you to ask multiple questions.",
45
+ version="1.1.0"
46
+ )
47
+
48
+ app.add_middleware(
49
+ CORSMiddleware,
50
+ allow_origins=["*"],
51
+ allow_credentials=True,
52
+ allow_methods=["*"],
53
+ allow_headers=["*"],
54
+ )
55
+
56
# --- Lazy Loading for Models and Clients ---
# Module-level singletons populated by initialize_components() at startup;
# each stays None until initialized (supabase_client stays None if the
# Supabase credentials are missing, and /query then returns 503).
# BUG FIX: the LLM is constructed as ChatGoogleGenerativeAI in
# initialize_components(), so the annotation must match (was ChatTogether).
llm: Optional[ChatGoogleGenerativeAI] = None
gemini_embedder: Optional[GoogleGenerativeAIEmbeddings] = None
supabase_client: Optional[Client] = None
60
+
61
+ # --- 2. STARTUP EVENT ---
62
+
63
# NOTE(review): @app.on_event is the legacy FastAPI startup hook (newer code
# uses lifespan handlers). It runs once per server process so the first
# request does not pay the model/client construction cost.
@app.on_event("startup")
async def startup_event():
    """On application startup, initialize components."""
    print("Application startup: Initializing components...")
    initialize_components()
68
+
69
def initialize_components():
    """Initializes models and clients if they haven't been already.

    Idempotent lazy initializer: each module-level global is built only if
    it is still None, so repeated calls are cheap and safe.
    """
    global llm, gemini_embedder, supabase_client
    if llm is None:
        print("Initializing Gemini AI LLM...")
        # NOTE(review): despite the ChatTogether import elsewhere in this
        # file, the chat model actually used is Google's gemini-pro.
        # Low temperature keeps answers close to the retrieved context.
        llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.2, convert_system_message_to_human=True)
    if gemini_embedder is None:
        print("Initializing Google Gemini Embedding model...")
        # All safety categories set to BLOCK_NONE — presumably so document
        # chunks are not rejected during embedding; assumes trusted input
        # documents (TODO confirm).
        gemini_embedder = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            safety_settings={
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            }
        )
    # Only create the DB client when both credentials are present; otherwise
    # supabase_client stays None and /query responds with 503.
    if supabase_client is None and SUPABASE_URL and SUPABASE_SERVICE_KEY:
        print("Initializing Supabase client...")
        supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
    print("Models and clients are ready.")
90
+
91
+
92
# --- Prompt Template for the LLM ---
# Grounded-QA prompt used by the RetrievalQA "stuff" chain: it restricts the
# model to the retrieved `context` and prescribes a fixed fallback sentence
# when the answer is not present in the document.
PROMPT_TEMPLATE = PromptTemplate(
    template="""
**Your Role:** You are a helpful AI assistant. Your task is to answer the user's question based *only* on the provided document context.
**Strict Constraints:**
- Your answer **MUST** be derived solely from the `Document Context`. Do not use any external knowledge.
- If the answer is not in the context, state: "The answer to this question could not be found in the provided document."
---
**Document Context:** {context}
---
**User Query:** {question}
---
**Helpful Answer:**
""",
    input_variables=["question", "context"]
)
108
+
109
+
110
+ # --- Pydantic Models for Request and Response ---
111
class QueryRequest(BaseModel):
    """Request body for POST /query: one document URL plus the questions to ask."""
    document_url: HttpUrl = Field(..., description="A single public URL to a document (PDF, DOCX, EML).")
    questions: List[str] = Field(..., min_length=1, description="A non-empty list of questions to ask about the document.")
114
+
115
+
116
class QueryResponse(BaseModel):
    """Response body for POST /query."""
    answers: List[str]      # one answer per input question, in the same order
    document_url: HttpUrl   # echo of the queried document URL
    message: str            # ingestion status (cache hit vs. newly processed)
120
+
121
+
122
# --- 3. CORE API ENDPOINT ---
@app.post('/query', response_model=QueryResponse, tags=["Document Q&A"])
async def query_document(payload: QueryRequest):
    """
    Ingest a document (if it is new) and answer questions about it.

    Flow:
      1. Look the document URL up in the Supabase `documents` table.
      2. On a miss: download the file, pick a loader by URL extension
         (PDF/DOCX/EML, defaulting to PDF), split it into overlapping
         chunks, and embed the chunks into Supabase.
      3. Run a "stuff" RetrievalQA chain once per question, in parallel,
         filtered to this document's chunks.

    Raises:
        HTTPException: 503 when the Supabase client is unavailable,
            500 on ingestion or question-answering failure.
    """
    print("\n--- New Request Received for /query ---")

    if not supabase_client:
        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="Database client is not available. Check Supabase credentials.")

    doc_url = str(payload.document_url)
    questions = payload.questions
    print(f"Processing document from URL: {doc_url}")
    print(f"Answering {len(questions)} questions.")

    vectorstore = None
    ingestion_message = ""

    try:
        # --- Check for Existing Embeddings in Supabase ---
        print("Step 1: Checking for existing document vectors in Supabase...")
        response = supabase_client.from_("documents").select("id", count='exact').eq("metadata->>source", doc_url).limit(1).execute()

        # BUG FIX: `count` may be None when the backend omits it; treat
        # that as a cache miss instead of raising a TypeError.
        if (response.count or 0) > 0:
            print("DATABASE HIT: Found pre-processed vectors. Skipping ingestion.")
            ingestion_message = "Document already processed. Using existing vectors from database."
            vectorstore = SupabaseVectorStore(
                client=supabase_client,
                embedding=gemini_embedder,
                table_name="documents",
                query_name="match_documents"
            )
        else:
            print("DATABASE MISS: Processing and embedding new document.")
            ingestion_message = "New document processed and vectors stored in database."
            temp_file_path = None
            try:
                print("Step 1a: Downloading document...")
                # BUG FIX: without a timeout a stalled download would hang
                # this worker indefinitely.
                http_response = requests.get(doc_url, timeout=60)
                http_response.raise_for_status()

                with tempfile.NamedTemporaryFile(delete=False, suffix=".tmp") as temp_file:
                    temp_file.write(http_response.content)
                    temp_file_path = temp_file.name

                # Choose a loader from the URL extension; unknown types fall
                # back to the PDF loader (the most common case).
                lower_doc_url = doc_url.lower()
                if lower_doc_url.endswith('.pdf'):
                    loader = PyPDFLoader(temp_file_path)
                elif lower_doc_url.endswith('.docx'):
                    loader = Docx2txtLoader(temp_file_path)
                elif lower_doc_url.endswith('.eml'):
                    loader = UnstructuredEmailLoader(temp_file_path)
                else:
                    loader = PyPDFLoader(temp_file_path)

                pages = loader.load()
                if not pages:
                    raise ValueError("Could not load content from the document.")
                print(f"Step 2: Loaded {len(pages)} pages/sections.")

                text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                docs = text_splitter.split_documents(pages)
                if not docs:
                    raise ValueError("Document could not be split into processable chunks.")

                # Replace per-chunk metadata so the retriever's source filter
                # (set below) matches exactly on the document URL.
                for doc in docs:
                    doc.metadata = {"source": doc_url}

                print(f"Step 3: Split document into {len(docs)} chunks. Uploading to Supabase...")
                vectorstore = await SupabaseVectorStore.afrom_documents(
                    documents=docs,
                    embedding=gemini_embedder,
                    client=supabase_client,
                    table_name="documents",
                    query_name="match_documents",
                    chunk_size=50
                )
                print("Step 4: Supabase vector store created successfully.")

            finally:
                # Always remove the downloaded temp file, even on failure.
                if temp_file_path and os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
                    print(f"Temporary file removed: {temp_file_path}")

        # --- Create Retriever and QA Chain ---
        retriever = vectorstore.as_retriever(search_kwargs={"k": 15, "filter": {"source": doc_url}})
        print("Step 5: Initialized retriever with source document filter.")

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=False,
            chain_type_kwargs={"prompt": PROMPT_TEMPLATE}
        )
        print("Step 6: RAG QA Chain created.")

    except Exception as e:
        print(f"ERROR during document processing or vector store setup: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to process document: {str(e)}")

    # --- Question Answering ---
    try:
        async def get_answer(chain, query):
            """Run one question through the chain; never raises — per-question
            failures are returned as error text so one bad question cannot
            fail the whole request."""
            print(f"Processing query: '{query}'")
            try:
                result = await chain.ainvoke({"query": query})
                answer = result.get('result', 'Error: Could not process this question.').strip()
                print(f"Successfully answered: '{query}'")
                return answer
            except Exception as e:
                error_message = f"Error for query '{query}': {str(e)}"
                print(f"ERROR invoking chain: {error_message}")
                traceback.print_exc()
                return error_message

        print(f"Step 7: Starting parallel processing for {len(questions)} questions...")
        tasks = [get_answer(qa_chain, q) for q in questions]
        # asyncio.gather preserves input order, so answers align with questions.
        answers = await asyncio.gather(*tasks)
        print("All questions processed successfully.")

        final_response = {
            "answers": answers,
            "document_url": doc_url,
            "message": ingestion_message
        }
        print(f"Final response prepared: {json.dumps(final_response, indent=2)}")
        return final_response

    except Exception as e:
        print(f"ERROR during question answering: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during question answering: {str(e)}")
253
+
254
+
255
# --- Health Check Endpoint ---
@app.get("/", tags=["General"])
def read_root():
    """Health check: confirms the API process is up and responding."""
    service_info = {"status": "ok", "name": "Intelligent Document Q&A API"}
    return service_info
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ python-dotenv
4
+ pydantic
5
+ requests
6
+ httpx
7
+ langchain
8
+ langchain-community
9
+ langchain-together
10
+ langchain-google-genai
11
+ google-generativeai
12
+ pypdf
13
+ docx2txt
14
+ unstructured-client
15
+ unstructured[eml]
16
+ supabase
17
+ psycopg2-binary