nrigheriu committed on
Commit
869f31e
·
verified ·
1 Parent(s): 8971f7a

added app files

Browse files
Files changed (7) hide show
  1. README_HF.md +35 -0
  2. app.py +4 -6
  3. custom_types.py +21 -0
  4. data_loader.py +36 -0
  5. gradio_app.py +254 -0
  6. requirements.txt +8 -0
  7. vector_db.py +59 -0
README_HF.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG PDF Chat Application
2
+
3
+ A powerful Retrieval-Augmented Generation (RAG) application that allows you to upload PDF documents and ask questions about their content using AI.
4
+
5
+ ## Features
6
+
7
+ - **PDF Upload**: Upload PDF documents and automatically process them into searchable chunks
8
+ - **AI-Powered Q&A**: Ask questions about your uploaded PDFs and get intelligent answers
9
+ - **Vector Search**: Uses advanced embedding technology to find relevant information
10
+ - **Source Tracking**: See which parts of your documents contributed to each answer
11
+
12
+ ## How to Use
13
+
14
+ 1. **Upload a PDF**: Go to the "Upload PDF" tab and select a PDF file from your computer
15
+ 2. **Process the PDF**: Click "Upload & Process PDF" and wait while the app chunks and embeds your document
16
+ 3. **Ask Questions**: Switch to the "Ask Questions" tab and enter your questions
17
+ 4. **Get Answers**: Receive AI-generated answers based on your document content
18
+
19
+ ## Technical Details
20
+
21
+ - **Vector Database**: Uses Qdrant for efficient similarity search
22
+ - **Embeddings**: OpenAI's text-embedding-3-large model for embedding document chunks
23
+ - **Language Model**: GPT-4 for generating intelligent answers
24
+ - **Framework**: Built with Gradio for easy deployment
25
+
26
+ ## Environment Variables
27
+
28
+ Make sure to set your OpenAI API key:
29
+ ```
30
+ OPENAI_API_KEY=your_openai_api_key_here
31
+ ```
32
+
33
+ ## Deployment
34
+
35
+ This app is designed to run on Hugging Face Spaces. Simply push this repository to a Hugging Face Space and it will automatically deploy.
app.py CHANGED
@@ -1,7 +1,5 @@
1
- import gradio as gr
 
2
 
3
- def greet(name):
4
- return f"Hello, {name}!"
5
-
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ # Import the main Gradio app
2
+ from gradio_app import demo
3
 
4
# Start the Gradio server only when this file is executed as a script,
# so importing the module does not launch anything.
if __name__ == "__main__":
    demo.launch()
 
 
 
custom_types.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Optional

import pydantic
2
+
3
+
4
# Text chunks together with an optional identifier of the source they came from.
class RAGChunkAndSrc(pydantic.BaseModel):
    # The chunk texts.
    chunks: list[str]
    # Optional so that an explicit ``source_id=None`` validates; the previous
    # ``str = None`` annotation only tolerated None as an unvalidated default.
    source_id: Optional[str] = None
7
+
8
+
9
# Result model for an upsert; carries a single required integer field.
class RAGUpsertResult(pydantic.BaseModel):
    # presumably the number of items written -- confirm against the caller
    ingested: int
11
+
12
+
13
# Result model for a vector search: retrieved texts plus their source names.
class RAGSearchResult(pydantic.BaseModel):
    # Retrieved chunk texts.
    contexts: list[str]
    # Source identifiers associated with the retrieved chunks.
    sources: list[str]
16
+
17
+
18
# Result model for an answered question: the answer text, the source names,
# and a count of contexts.
class RAQQueryResult(pydantic.BaseModel):
    answer: str
    sources: list[str]
    num_contexts: int


# Correctly spelled alias. The original (misspelled) class name is kept so
# that existing imports of ``RAQQueryResult`` continue to work.
RAGQueryResult = RAQQueryResult
data_loader.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ from llama_index.readers.file import PDFReader
3
+ from llama_index.core.node_parser import SentenceSplitter
4
+ from dotenv import load_dotenv
5
+
6
# Load variables from a local .env file before any client is constructed.
load_dotenv()

# Embedding model name and the dimensionality used for its vectors.
EMBED_MODEL = "text-embedding-3-large"
EMBED_DIM = 3072

# Module-level OpenAI client shared by the embedding helper.
client = OpenAI()

# Splitter applied to extracted PDF text: chunk size 1000 with an overlap of 200.
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
13
+
14
def load_and_chunk_pdf(path: str):
    """Read the PDF at *path* and return its text as a flat list of chunks.

    Documents without text are skipped, each remaining text is split with the
    module-level ``splitter``, and whitespace-only chunks are discarded.
    """
    documents = PDFReader().load_data(file=path)
    chunks = []
    for document in documents:
        document_text = getattr(document, "text", None)
        if not document_text:
            continue
        # Keep only pieces that contain something other than whitespace.
        chunks += [piece for piece in splitter.split_text(document_text) if piece.strip()]
    return chunks
23
+
24
+
25
def embed_texts(texts: list[str], batch_size: int = 100) -> list[list[float]]:
    """Embed *texts* with ``EMBED_MODEL`` and return one vector per non-empty text.

    Empty and whitespace-only entries are dropped before embedding, so the
    result can be shorter than *texts*; callers that need a one-to-one mapping
    should pass pre-filtered input.

    Args:
        texts: Strings to embed.
        batch_size: Maximum number of texts sent per embeddings request.
            Requests are batched because the embeddings endpoint limits how
            much input a single call may carry, which a large PDF can exceed.

    Returns:
        Embedding vectors in the same order as the retained texts.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # The API rejects empty strings, so filter them out up front.
    texts = [text for text in texts if text and text.strip()]
    if not texts:
        return []

    embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=EMBED_MODEL,
            input=texts[start:start + batch_size],
        )
        embeddings.extend(item.embedding for item in response.data)
    return embeddings
gradio_app.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import asyncio
3
+ import threading
4
+ import time
5
+ from pathlib import Path
6
+ import uuid
7
+ import os
8
+ from dotenv import load_dotenv
9
+
10
+ # Import your existing modules
11
+ from data_loader import load_and_chunk_pdf, embed_texts
12
+ from vector_db import QdrantStorage
13
+ from custom_types import RAGSearchResult
14
+ from openai import OpenAI
15
+
16
# Load variables from a local .env file before the client is constructed.
load_dotenv()

# Module-level OpenAI client used for chat completions.
openai_client = OpenAI()
20
+
21
class RAGProcessor:
    """Store uploaded PDFs, ingest them into the vector store, and answer questions.

    Uploaded files are copied into ``uploads/``. Their chunks are embedded and
    upserted through ``QdrantStorage``; questions are answered by retrieving
    matching chunks and passing them to the chat model as context.
    """

    def __init__(self):
        self.vector_store = QdrantStorage()
        self.uploads_dir = Path("uploads")
        self.uploads_dir.mkdir(parents=True, exist_ok=True)

    def save_uploaded_pdf(self, file) -> Path:
        """Copy an uploaded PDF into the uploads directory under a unique name.

        Args:
            file: Either a filesystem path (``str``/``Path``), an object whose
                ``name`` attribute is the path of the uploaded file, or an
                object that additionally exposes ``getbuffer()``.

        Returns:
            Path of the saved copy, named ``<stem>_<8 hex chars><suffix>``.

        Raises:
            TypeError: If *file* has no ``name`` and is not path-like.
            OSError: If the source cannot be read or the copy cannot be written.
        """
        source_name = getattr(file, "name", None)
        if source_name is None:
            # Plain path input; fspath rejects unsupported types explicitly.
            source_name = os.fspath(file)
        source_path = Path(source_name)

        unique_id = str(uuid.uuid4())[:8]
        file_path = self.uploads_dir / f"{source_path.stem}_{unique_id}{source_path.suffix}"

        if hasattr(file, "getbuffer"):
            # In-memory upload object: write its buffer directly.
            file_path.write_bytes(file.getbuffer())
        else:
            # Path-based upload (what gr.File passes by default): copy from disk.
            file_path.write_bytes(source_path.read_bytes())
        return file_path

    def ingest_pdf(self, pdf_path: Path) -> str:
        """Chunk, embed, and upsert a PDF; return a human-readable status message.

        Errors are reported in the returned string rather than raised, because
        the result is shown directly in the UI.
        """
        try:
            chunks = load_and_chunk_pdf(str(pdf_path))
            if not chunks:
                return f"Error ingesting PDF: no extractable text found in {pdf_path.name}"

            embeddings = embed_texts(chunks)
            if len(embeddings) != len(chunks):
                # Pairing by index below would silently attach vectors to the
                # wrong text if the counts differ.
                return (
                    "Error ingesting PDF: received "
                    f"{len(embeddings)} embeddings for {len(chunks)} chunks"
                )

            # Deterministic per-chunk IDs derived from the saved file's stem.
            source_id = pdf_path.stem
            ids = [
                str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source_id}:{index}"))
                for index, _ in enumerate(chunks)
            ]
            payloads = [{"source": source_id, "text": chunk} for chunk in chunks]

            self.vector_store.upsert(ids, embeddings, payloads)

            return f"Successfully ingested {len(chunks)} chunks from {pdf_path.name}"

        except Exception as e:  # UI boundary: report instead of crashing the handler
            return f"Error ingesting PDF: {str(e)}"

    def query_pdf(self, question: str, top_k: int = 5, source_filter: "str | None" = None) -> dict:
        """Retrieve relevant chunks for *question* and generate an answer.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve.
            source_filter: When given, restrict retrieval to this source name.

        Returns:
            A dict with keys ``"answer"`` (str), ``"sources"`` (list) and
            ``"contexts"`` (list). Failures are reported in ``"answer"``.
        """
        if not question or not question.strip():
            # embed_texts drops blank input, so indexing its result would fail.
            return {"answer": "Please enter a question.", "sources": [], "contexts": []}

        try:
            query_embedding = embed_texts([question])[0]

            search_results = self.vector_store.search(query_embedding, top_k, source_filter)

            if not search_results["contexts"]:
                return {
                    "answer": "No relevant information found in the uploaded PDFs.",
                    "sources": [],
                    "contexts": []
                }

            context_block = "\n\n".join(f"- {c}" for c in search_results["contexts"])
            user_content = (
                "Use the following context to answer the question.\n\n"
                f"Context:\n{context_block}\n\n"
                f"Question: {question}\n"
                "Answer concisely using the context above."
            )

            response = openai_client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You answer questions using only the provided context."},
                    {"role": "user", "content": user_content}
                ],
                max_tokens=1024,
                temperature=0.2
            )

            # The message content can be None; treat that as an empty answer.
            answer = (response.choices[0].message.content or "").strip()

            return {
                "answer": answer,
                "sources": search_results["sources"],
                "contexts": search_results["contexts"]
            }

        except Exception as e:  # UI boundary: report instead of crashing the handler
            return {
                "answer": f"Error processing query: {str(e)}",
                "sources": [],
                "contexts": []
            }

    def get_most_recent_pdf(self) -> "str | None":
        """Return the stem of the most recently modified PDF in the uploads directory.

        The extension is matched case-insensitively. Returns ``None`` when the
        directory does not exist or contains no PDF files.
        """
        if not self.uploads_dir.is_dir():
            return None

        pdf_files = [
            candidate
            for candidate in self.uploads_dir.iterdir()
            if candidate.is_file() and candidate.suffix.lower() == ".pdf"
        ]
        if not pdf_files:
            return None

        most_recent = max(pdf_files, key=lambda p: p.stat().st_mtime)
        return most_recent.stem
125
+
126
# Single shared RAGProcessor instance used by the Gradio callbacks in this module.
rag_processor = RAGProcessor()
128
+
129
def upload_and_ingest_pdf(file):
    """Save an uploaded PDF and ingest it, returning a status message for the UI.

    Args:
        file: The value provided by the file-upload component, or ``None``
            when nothing was selected.

    Returns:
        A status string: a prompt to upload, a save error, or the ingestion result.
    """
    if file is None:
        return "Please upload a PDF file."

    try:
        pdf_path = rag_processor.save_uploaded_pdf(file)
    except Exception as exc:  # UI boundary: show the failure in the status box
        return f"Error saving PDF: {exc}"

    return rag_processor.ingest_pdf(pdf_path)
141
+
142
def ask_question(question, top_k, use_recent_pdf):
    """Answer *question* and return ``(answer_text, sources_text)`` for the UI.

    Args:
        question: The user's question; blank or missing input is rejected.
        top_k: Number of chunks to retrieve (converted with ``int``).
        use_recent_pdf: When true, restrict the search to the most recently
            uploaded PDF.

    Returns:
        A pair of strings. Both outputs are text boxes, so the sources value is
        always a string (empty on the early-exit paths).
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    source_filter = None
    if use_recent_pdf:
        source_filter = rag_processor.get_most_recent_pdf()
        if not source_filter:
            return "No recent PDF found. Please upload a PDF first.", ""

    result = rag_processor.query_pdf(question, int(top_k), source_filter)

    sources = result["sources"]
    if sources:
        sources_text = "\n".join(f"• {source}" for source in sources)
    else:
        sources_text = "No sources found"

    return result["answer"], sources_text
163
+
164
+ # Create Gradio interface
165
# Gradio interface: one tab for uploading/ingesting a PDF, one for asking questions.
with gr.Blocks(title="RAG PDF Chat", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 RAG PDF Chat Application")
    gr.Markdown("Upload PDFs and ask questions about their content using AI-powered retrieval.")

    with gr.Tab("Upload PDF"):
        gr.Markdown("### Upload a PDF Document")
        pdf_upload = gr.File(
            label="Choose a PDF file",
            file_types=[".pdf"],
            file_count="single"
        )
        upload_btn = gr.Button("Upload & Process PDF", variant="primary")
        upload_status = gr.Textbox(
            label="Upload Status",
            interactive=False,
            lines=2
        )

    with gr.Tab("Ask Questions"):
        gr.Markdown("### Ask Questions About Your PDFs")

        with gr.Row():
            with gr.Column(scale=3):
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is the main topic of the document?",
                    lines=3
                )

                with gr.Row():
                    top_k_slider = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Number of chunks to retrieve"
                    )
                    use_recent_checkbox = gr.Checkbox(
                        label="Search only in most recent PDF",
                        value=True
                    )

                ask_btn = gr.Button("Ask Question", variant="primary")

            with gr.Column(scale=2):
                recent_pdf_info = gr.Markdown("")

        with gr.Row():
            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    interactive=False,
                    lines=8
                )

            with gr.Column():
                sources_output = gr.Textbox(
                    label="Sources",
                    interactive=False,
                    lines=8
                )

    def update_recent_pdf_info():
        """Return the markdown line describing the most recently uploaded PDF."""
        recent_pdf = rag_processor.get_most_recent_pdf()
        if recent_pdf:
            return f"🔍 **Most recent PDF:** {recent_pdf}"
        else:
            return "⚠️ **No PDFs uploaded yet.**"

    # Show the current most-recent PDF when the page loads.
    demo.load(
        fn=update_recent_pdf_info,
        outputs=[recent_pdf_info]
    )

    # Registered after all components exist so the handler can also refresh
    # the "most recent PDF" label; otherwise it stays stale until a reload.
    upload_btn.click(
        fn=upload_and_ingest_pdf,
        inputs=[pdf_upload],
        outputs=[upload_status]
    ).then(
        fn=update_recent_pdf_info,
        outputs=[recent_pdf_info]
    )

    ask_btn.click(
        fn=ask_question,
        inputs=[question_input, top_k_slider, use_recent_checkbox],
        outputs=[answer_output, sources_output]
    )

# Launch only when run as a script; app.py imports ``demo`` and launches it itself.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ fastapi>=0.116.1
3
+ llama-index-core>=0.14.0
4
+ llama-index-readers-file>=0.5.4
5
+ openai>=1.107.0
6
+ python-dotenv>=1.1.1
7
+ qdrant-client>=1.15.1
8
+ uvicorn>=0.35.0
vector_db.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qdrant_client import QdrantClient
2
+ from qdrant_client.models import VectorParams, Distance, PointStruct
3
+
4
+
5
class QdrantStorage:
    """Thin wrapper around a local (on-disk) Qdrant collection of text chunks.

    Each stored point carries a payload with ``"text"`` (the chunk) and
    ``"source"`` (the document it came from).
    """

    def __init__(self, path="./qdrant_storage", collection="docs", dim=3072):
        """Open the local store at *path* and create *collection* if it is missing.

        Args:
            path: Directory used by the local Qdrant client.
            collection: Collection name.
            dim: Vector size used when the collection has to be created.
        """
        # Local mode: data lives under `path` and is reused across runs.
        self.client = QdrantClient(path=path)
        self.collection = collection
        if not self.client.collection_exists(self.collection):
            self.client.create_collection(
                collection_name=self.collection,
                vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
            )

    def upsert(self, ids, vectors, payloads):
        """Insert or update points built from parallel *ids*, *vectors*, *payloads*.

        Raises:
            ValueError: If the three sequences differ in length, since pairing
                them by position would otherwise mismatch or drop entries.
        """
        if not (len(ids) == len(vectors) == len(payloads)):
            raise ValueError(
                "ids, vectors and payloads must have the same length "
                f"(got {len(ids)}, {len(vectors)}, {len(payloads)})"
            )
        if not ids:
            # Nothing to write; skip the client call entirely.
            return

        points = [
            PointStruct(id=point_id, vector=vector, payload=payload)
            for point_id, vector, payload in zip(ids, vectors, payloads)
        ]
        self.client.upsert(self.collection, points=points)

    def search(self, query_vector, top_k: int = 5, source_filter: str = None):
        """Return the texts and sources of the *top_k* points nearest to *query_vector*.

        Args:
            query_vector: Embedding of the query.
            top_k: Maximum number of points to return.
            source_filter: When truthy, only points whose ``"source"`` payload
                equals this value are searched.

        Returns:
            ``{"contexts": [...], "sources": [...]}`` where ``contexts`` are the
            matched chunk texts in result order and ``sources`` are the
            distinct source names in first-seen order.
        """
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        query_filter = None
        if source_filter:
            query_filter = Filter(
                must=[
                    FieldCondition(
                        key="source",
                        match=MatchValue(value=source_filter)
                    )
                ]
            )

        # query_points replaces the deprecated client.search; a None filter
        # searches across all sources.
        response = self.client.query_points(
            collection_name=self.collection,
            query=query_vector,
            query_filter=query_filter,
            with_payload=True,
            limit=top_k
        )

        contexts = []
        sources = {}  # dict keys de-duplicate while keeping a stable order

        for point in response.points:
            payload = getattr(point, "payload", None) or {}
            text = payload.get("text", "")
            source = payload.get("source", "")
            if text:
                contexts.append(text)
                sources[source] = None

        return {"contexts": contexts, "sources": list(sources)}