ChrisSacrumCor committed
Commit f1b5c29 · verified · 1 Parent(s): ddeb653

Create app.py

Files changed (1)
  1. app.py +435 -0
app.py ADDED
@@ -0,0 +1,435 @@
+ import gradio as gr
+ import lancedb
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+ from langgraph.graph import StateGraph, END
+ from langchain.tools import tool
+ from langgraph.prebuilt import create_react_agent
+ import os
+ import shutil
+ from typing import List, Dict, Optional, Annotated
+ from pydantic import BaseModel
+ import PyPDF2
+ from langgraph.graph.message import add_messages
+ import traceback
+
+ # Global setup
+ db = lancedb.connect("./global_vector_db")
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+ llm = ChatOpenAI(model="gpt-3.5-turbo")
+
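+ # NOTE: OpenAIEmbeddings and ChatOpenAI read the OPENAI_API_KEY environment
+ # variable; it must be set before launch or every embed/chat call below fails.
+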
+ def init_documents_table():
+     table_name = "documents_v2"  # Use a new table name to avoid the corrupted schema
+
+     try:
+         documents_table = db.open_table(table_name)
+         print(f"✅ Opened existing table: {table_name}")
+         return documents_table, "embedding"
+
+     except Exception as e:
+         print(f"🔄 Creating new table {table_name}... ({e})")
+
+         # Create a clean table with a proper vector schema
+         sample_doc = [{
+             "text": "sample initialization text",
+             "embedding": embeddings.embed_query("sample"),
+             "source": "init",
+             "doc_id": "init",
+             "chunk_id": 0,
+             "summary": "initialization"
+         }]
+
+         documents_table = db.create_table(table_name, sample_doc)
+         print(f"✅ Created new table: {table_name}")
+         return documents_table, "embedding"
+
+ documents_table, vector_column_name = init_documents_table()
+
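+ # NOTE: LanceDB infers the table schema (vector column included) from the rows
+ # passed to create_table, which is why init_documents_table() seeds a dummy record.
+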
+ def extract_text_with_pypdf2(file_path: str) -> str:
+     """Extract text using PyPDF2 as the primary method."""
+     try:
+         print("📖 Extracting text with PyPDF2...")
+         text = ""
+         with open(file_path, 'rb') as file:
+             pdf_reader = PyPDF2.PdfReader(file)
+             print(f"📄 Found {len(pdf_reader.pages)} pages")
+
+             for page_num, page in enumerate(pdf_reader.pages):
+                 try:
+                     page_text = page.extract_text()
+                     if page_text and page_text.strip():
+                         text += f"\n--- Page {page_num + 1} ---\n{page_text.strip()}\n"
+                         print(f"✅ Extracted {len(page_text)} chars from page {page_num + 1}")
+                     else:
+                         print(f"⚠️ No text on page {page_num + 1}")
+                 except Exception as page_error:
+                     print(f"❌ Error extracting page {page_num + 1}: {page_error}")
+                     continue
+
+         return text.strip()
+     except Exception as e:
+         print(f"❌ PyPDF2 extraction failed: {e}")
+         return ""
+
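+ # NOTE: PyPDF2 only recovers an embedded text layer, so scanned/image-only PDFs
+ # come back empty here; the Docling fallback below exists for exactly that case.
+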
+ def extract_text_with_docling(file_path: str) -> str:
+     """Try Docling extraction with better error handling."""
+     try:
+         from docling.document_converter import DocumentConverter
+         converter = DocumentConverter()
+
+         print("📄 Trying Docling conversion...")
+         result = converter.convert(file_path)
+
+         text = ""
+
+         # Debug the result structure
+         print(f"🔍 Docling result type: {type(result)}")
+         print(f"🔍 Docling result attributes: {dir(result)}")
+
+         # Try different ways to access the content
+         if hasattr(result, 'document'):
+             doc = result.document
+             print(f"🔍 Document type: {type(doc)}")
+             print(f"🔍 Document attributes: {dir(doc)}")
+
+             if hasattr(doc, 'pages'):
+                 print(f"🔍 Pages type: {type(doc.pages)}")
+                 print(f"🔍 Number of pages: {len(doc.pages) if hasattr(doc.pages, '__len__') else 'unknown'}")
+
+                 # Check what pages actually contains
+                 if hasattr(doc.pages, '__iter__'):
+                     for i, page in enumerate(doc.pages):
+                         print(f"🔍 Page {i} type: {type(page)}")
+                         if hasattr(page, 'text'):
+                             page_text = page.text
+                             if page_text and len(str(page_text).strip()) > 50:
+                                 text += f"\n--- Page {i + 1} ---\n{page_text}\n"
+                         elif hasattr(page, 'content'):
+                             page_text = str(page.content)
+                             if page_text and len(page_text.strip()) > 50:
+                                 text += f"\n--- Page {i + 1} ---\n{page_text}\n"
+                         else:
+                             print(f"⚠️ Page {i} has no text/content attribute")
+
+             elif hasattr(doc, 'text'):
+                 text = doc.text
+             elif hasattr(doc, 'content'):
+                 text = str(doc.content)
+
+         elif hasattr(result, 'text'):
+             text = result.text
+         elif hasattr(result, 'content'):
+             text = str(result.content)
+
+         return text.strip()
+
+     except Exception as e:
+         print(f"❌ Docling extraction failed: {e}")
+         traceback.print_exc()
+         return ""
+
+ @tool
+ def add_document_to_knowledge_base(file_path: str) -> str:
+     """Process and add a document to the global knowledge base."""
+     try:
+         print(f"🔍 Processing file: {file_path}")
+
+         if not os.path.exists(file_path):
+             return f"❌ File not found: {file_path}"
+
+         doc_id = os.path.basename(file_path)
+
+         # Try multiple extraction methods
+         extracted_text = ""
+
+         # Method 1: Try PyPDF2 first (more reliable)
+         if file_path.lower().endswith('.pdf'):
+             extracted_text = extract_text_with_pypdf2(file_path)
+
+         # Method 2: Try Docling if PyPDF2 failed
+         if not extracted_text:
+             print("🔄 PyPDF2 failed, trying Docling...")
+             extracted_text = extract_text_with_docling(file_path)
+
+         # Method 3: Simple file reading for text files
+         if not extracted_text and file_path.lower().endswith(('.txt', '.md')):
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     extracted_text = f.read()
+             except Exception as e:
+                 print(f"❌ Text file reading failed: {e}")
+
+         if not extracted_text or len(extracted_text.strip()) < 50:
+             return f"❌ Could not extract meaningful text from {doc_id}. The file may be an image-based PDF or corrupted."
+
+         print(f"📝 Successfully extracted {len(extracted_text)} characters")
+
+         # Create summary
+         summary_text = extracted_text[:3000]  # Limit for the API
+         summary_prompt = f"""Summarize this document in 2-3 clear sentences, focusing on the main topics and key points:
+
+ {summary_text}"""
+
+         try:
+             summary_response = llm.invoke(summary_prompt)
+             doc_summary = summary_response.content.strip()
+         except Exception as e:
+             print(f"⚠️ Summary generation failed: {e}")
+             doc_summary = f"Document containing {len(extracted_text)} characters of text"
+
+         print(f"✅ Summary: {doc_summary}")
+
+         # Split into chunks (simple approach)
+         chunk_size = 1000
+         overlap = 100
+         text_chunks = []
+
+         for i in range(0, len(extracted_text), chunk_size - overlap):
+             chunk = extracted_text[i:i + chunk_size].strip()
+             if len(chunk) > 100:  # Skip tiny chunks
+                 text_chunks.append(chunk)
+
+         print(f"🔄 Creating {len(text_chunks)} chunks and embeddings...")
+
+         # Create embeddings and prepare data
+         chunks_data = []
+         for i, chunk_text in enumerate(text_chunks):
+             try:
+                 embedding = embeddings.embed_query(chunk_text)
+
+                 chunk_data = {
+                     "text": chunk_text,
+                     "embedding": embedding,  # Always use 'embedding' as the column name
+                     "source": doc_id,
+                     "doc_id": doc_id,
+                     "chunk_id": i,
+                     "summary": doc_summary
+                 }
+                 chunks_data.append(chunk_data)
+
+             except Exception as e:
+                 print(f"⚠️ Failed to embed chunk {i}: {e}")
+                 continue
+
+         if not chunks_data:
+             return f"❌ Failed to create any valid chunks from {doc_id}"
+
+         # Add to LanceDB
+         print(f"💾 Adding {len(chunks_data)} chunks to LanceDB...")
+         documents_table.add(chunks_data)
+
+         return f"""✅ Successfully processed {doc_id}:
+ - Extracted: {len(extracted_text)} characters
+ - Created: {len(chunks_data)} chunks
+ - Added to knowledge base
+ - Summary: {doc_summary}"""
+
+     except Exception as e:
+         print(f"❌ Error processing document: {str(e)}")
+         traceback.print_exc()
+         return f"❌ Error processing document: {str(e)}"
+
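+ # NOTE: with chunk_size=1000 and overlap=100, the chunking loop above advances in
+ # 900-character strides, so consecutive chunks share 100 characters of context.
+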
+ @tool
+ def search_text_directly(query: str, limit: int = 3) -> str:
+     """Search document text directly using keyword matching (fallback method)."""
+     try:
+         print(f"🔍 Direct text search for: {query}")
+
+         # Get all documents and search by text matching
+         all_docs = documents_table.to_pandas()
+
+         if all_docs.empty:
+             return "No documents in knowledge base."
+
+         # Simple keyword matching
+         query_lower = query.lower()
+         matches = []
+
+         for _, doc in all_docs.iterrows():
+             text_lower = doc['text'].lower()
+             if any(word in text_lower for word in query_lower.split()):
+                 matches.append(doc)
+
+         if not matches:
+             return f"No text matches found for '{query}'"
+
+         # Sort by relevance (count of matching words)
+         def relevance_score(text):
+             return sum(1 for word in query_lower.split() if word in text.lower())
+
+         matches.sort(key=lambda x: relevance_score(x['text']), reverse=True)
+         matches = matches[:limit]
+
+         print(f"📚 Found {len(matches)} text matches")
+
+         # Format results
+         formatted_results = []
+         for i, doc in enumerate(matches, 1):
+             text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
+             formatted_results.append(
+                 f"📄 **Match {i}** (from {doc['source']}):\n{text_preview}\n"
+             )
+
+         return ("\n" + "=" * 60 + "\n").join(formatted_results)
+
+     except Exception as e:
+         print(f"❌ Error in direct text search: {str(e)}")
+         return f"❌ Error in direct text search: {str(e)}"
+
+ @tool
+ def search_knowledge_base(query: str, limit: int = 3) -> str:
+     """Search the global knowledge base for relevant information."""
+     try:
+         print(f"🔍 Searching knowledge base for: {query}")
+
+         # Create the query embedding
+         query_vector = embeddings.embed_query(query)
+
+         # Simple search without specifying the vector column (let LanceDB auto-detect it)
+         results = documents_table.search(query_vector).limit(limit).to_list()
+
+         if not results:
+             return "No relevant documents found in knowledge base."
+
+         print(f"📚 Found {len(results)} relevant chunks")
+
+         # Format results nicely
+         formatted_results = []
+         for i, doc in enumerate(results, 1):
+             text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
+             formatted_results.append(
+                 f"📄 **Result {i}** (from {doc['source']}):\n{text_preview}\n"
+             )
+
+         return ("\n" + "=" * 60 + "\n").join(formatted_results)
+
+     except Exception as e:
+         print(f"❌ Error searching knowledge base: {str(e)}")
+         traceback.print_exc()
+         return f"❌ Error searching knowledge base: {str(e)}"
+
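+ # NOTE: documents_table.search(query_vector) runs a vector nearest-neighbour
+ # query; because the table has a single vector column ("embedding"), LanceDB can
+ # resolve it without the column being named explicitly.
+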
+ # State definition using modern LangGraph patterns
+ class AgentState(BaseModel):
+     messages: Annotated[list, add_messages]
+     user_input: str = ""
+     uploaded_file_path: Optional[str] = None
+
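+ # NOTE: add_messages is a LangGraph reducer: message updates returned by nodes
+ # are appended to this list rather than replacing it.
+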
+ def agent_node(state: AgentState):
+     """Agent node built on create_react_agent."""
+
+     tools = [search_knowledge_base, add_document_to_knowledge_base, search_text_directly]
+
+     # Create the agent
+     agent = create_react_agent(llm, tools)
+
+     # Prepare the message
+     user_message = state.user_input
+     if state.uploaded_file_path:
+         user_message = f"I uploaded a file: {state.uploaded_file_path}. Please process it into the knowledge base and tell me about its contents. Then answer: {user_message}"
+
+     # Invoke the agent
+     try:
+         result = agent.invoke({
+             "messages": [{"role": "user", "content": user_message}]
+         })
+
+         return {
+             "messages": result["messages"],
+             "user_input": state.user_input,
+             "uploaded_file_path": state.uploaded_file_path
+         }
+
+     except Exception as e:
+         error_msg = f"❌ Agent error: {str(e)}"
+         print(error_msg)
+         traceback.print_exc()
+         return {
+             "messages": state.messages + [{"role": "assistant", "content": error_msg}],
+             "user_input": state.user_input,
+             "uploaded_file_path": state.uploaded_file_path
+         }
+
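+ # NOTE: create_react_agent returns a prebuilt graph that loops between the LLM
+ # and its tools until the model stops requesting tool calls. Rebuilding it on
+ # every turn (as above) works, but caching it at module level would avoid the
+ # small per-request overhead.
+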
+ # Build workflow
+ workflow = StateGraph(AgentState)
+ workflow.add_node("agent", agent_node)
+ workflow.set_entry_point("agent")
+ workflow.add_edge("agent", END)
+ app = workflow.compile()
+
+ def process_chat(message, history, uploaded_file):
+     """Process chat with file-upload handling."""
+
+     print(f"📥 Message: {message}")
+     print(f"📁 File: {uploaded_file}")
+
+     # Handle file upload
+     permanent_file_path = None
+     if uploaded_file is not None:
+         upload_dir = "./uploaded_docs"
+         os.makedirs(upload_dir, exist_ok=True)
+
+         filename = os.path.basename(uploaded_file.name)
+         permanent_file_path = os.path.join(upload_dir, filename)
+
+         try:
+             shutil.copy2(uploaded_file.name, permanent_file_path)
+             print(f"📋 Copied to: {permanent_file_path}")
+         except Exception as e:
+             print(f"❌ File copy failed: {e}")
+             permanent_file_path = None
+
+     # Create state and run the agent
+     state = AgentState(
+         messages=[],
+         user_input=message,
+         uploaded_file_path=permanent_file_path
+     )
+
+     try:
+         result = app.invoke(state)
+         # Get the last assistant message
+         assistant_messages = [msg for msg in result['messages']
+                               if (hasattr(msg, 'type') and msg.type == 'ai') or
+                               (isinstance(msg, dict) and msg.get('role') == 'assistant')]
+
+         if assistant_messages:
+             response = assistant_messages[-1].content if hasattr(assistant_messages[-1], 'content') else str(assistant_messages[-1])
+         else:
+             # Fallback: get the last message regardless of type
+             last_msg = result['messages'][-1] if result['messages'] else None
+             if last_msg:
+                 response = last_msg.content if hasattr(last_msg, 'content') else str(last_msg)
+             else:
+                 response = "No response generated"
+
+     except Exception as e:
+         response = f"❌ Error: {str(e)}"
+         print(f"❌ App error: {e}")
+         traceback.print_exc()
+
+     history.append([message, response])
+     return history, ""
+
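+ # NOTE: appending [user, assistant] pairs matches Gradio's tuple-style Chatbot
+ # history; recent Gradio releases prefer gr.Chatbot(type="messages") with
+ # role/content dicts instead.
+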
+ # Gradio interface
+ with gr.Blocks(title="Knowledge Base Agent") as demo:
+     gr.Markdown("# 📚 Knowledge Base Agent")
+     gr.Markdown("Upload PDF documents and ask questions! Uses PyPDF2 as the primary extraction method.")
+
+     chatbot = gr.Chatbot(height=500)
+
+     with gr.Row():
+         msg = gr.Textbox(
+             label="Message",
+             placeholder="Upload a document or ask a question...",
+             scale=4
+         )
+         upload = gr.File(
+             label="Upload",
+             file_types=[".pdf", ".docx", ".txt", ".md"],
+             scale=1
+         )
+
+     msg.submit(
+         process_chat,
+         inputs=[msg, chatbot, upload],
+         outputs=[chatbot, msg]
+     )
+
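+ # NOTE: .docx uploads are accepted by the file widget but only the Docling path
+ # can parse them; PyPDF2 covers .pdf and the plain-text reader covers .txt/.md.
+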
+ if __name__ == "__main__":
+     demo.launch(debug=True)