ChrisSacrumCor committed on
Commit c0f1437 · verified · Parent: 069a9d8

Update app.py

Files changed (1): app.py +216 -396

app.py CHANGED
@@ -1,435 +1,255 @@
- import gradio as gr
- import lancedb
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
- from langgraph.graph import StateGraph, END
- from langchain.tools import tool
- from langgraph.prebuilt import create_react_agent
  import os
- import shutil
- from typing import List, Dict, Optional, Annotated
- from pydantic import BaseModel
  import PyPDF2
- from langgraph.graph.message import add_messages
- import traceback
-
- # Global setup
- db = lancedb.connect("./global_vector_db")
- embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
- llm = ChatOpenAI(model="gpt-3.5-turbo")
-
- def init_documents_table():
-     table_name = "documents_v2"  # Use new table name to avoid corrupted schema
-
-     try:
-         documents_table = db.open_table(table_name)
-         print(f"✅ Opened existing table: {table_name}")
-         return documents_table, "embedding"
-
-     except Exception as e:
-         print(f"🔄 Creating new table {table_name}... ({e})")
-
-         # Create a clean table with proper vector schema
-         sample_doc = [{
-             "text": "sample initialization text",
-             "embedding": embeddings.embed_query("sample"),
-             "source": "init",
-             "doc_id": "init",
-             "chunk_id": 0,
-             "summary": "initialization"
-         }]
-
-         documents_table = db.create_table(table_name, sample_doc)
-         print(f"✅ Created new table: {table_name}")
-         return documents_table, "embedding"
-
- documents_table, vector_column_name = init_documents_table()
-
- def extract_text_with_pypdf2(file_path: str) -> str:
-     """Extract text using PyPDF2 as primary method"""
      try:
-         print(f"📖 Extracting text with PyPDF2...")
-         text = ""
-         with open(file_path, 'rb') as file:
-             pdf_reader = PyPDF2.PdfReader(file)
-             print(f"📄 Found {len(pdf_reader.pages)} pages")
-
-             for page_num, page in enumerate(pdf_reader.pages):
-                 try:
-                     page_text = page.extract_text()
-                     if page_text and page_text.strip():
-                         text += f"\n--- Page {page_num + 1} ---\n{page_text.strip()}\n"
-                         print(f"✅ Extracted {len(page_text)} chars from page {page_num + 1}")
-                     else:
-                         print(f"⚠️ No text on page {page_num + 1}")
-                 except Exception as page_error:
-                     print(f"❌ Error extracting page {page_num + 1}: {page_error}")
-                     continue
-
-         return text.strip()
      except Exception as e:
-         print(f"❌ PyPDF2 extraction failed: {e}")
-         return ""
-
- def extract_text_with_docling(file_path: str) -> str:
-     """Try Docling extraction with better error handling"""
      try:
-         from docling import DocumentConverter
-         converter = DocumentConverter()
-
-         print(f"📄 Trying Docling conversion...")
-         result = converter.convert(file_path)
-
-         text = ""
-
-         # Debug the result structure
-         print(f"🔍 Docling result type: {type(result)}")
-         print(f"🔍 Docling result attributes: {dir(result)}")
-
-         # Try different ways to access the content
-         if hasattr(result, 'document'):
-             doc = result.document
-             print(f"🔍 Document type: {type(doc)}")
-             print(f"🔍 Document attributes: {dir(doc)}")
-
-             if hasattr(doc, 'pages'):
-                 print(f"🔍 Pages type: {type(doc.pages)}")
-                 print(f"🔍 Number of pages: {len(doc.pages) if hasattr(doc.pages, '__len__') else 'unknown'}")
-
-                 # Check what pages actually contains
-                 if hasattr(doc.pages, '__iter__'):
-                     for i, page in enumerate(doc.pages):
-                         print(f"🔍 Page {i} type: {type(page)}")
-                         if hasattr(page, 'text'):
-                             page_text = page.text
-                             if page_text and len(str(page_text).strip()) > 50:
-                                 text += f"\n--- Page {i + 1} ---\n{page_text}\n"
-                         elif hasattr(page, 'content'):
-                             page_text = str(page.content)
-                             if page_text and len(page_text.strip()) > 50:
-                                 text += f"\n--- Page {i + 1} ---\n{page_text}\n"
-                         else:
-                             print(f"⚠️ Page {i} has no text/content attribute")
-
-             elif hasattr(doc, 'text'):
-                 text = doc.text
-             elif hasattr(doc, 'content'):
-                 text = str(doc.content)
-
-         elif hasattr(result, 'text'):
-             text = result.text
-         elif hasattr(result, 'content'):
-             text = str(result.content)
-
-         return text.strip()
-
      except Exception as e:
-         print(f"❌ Docling extraction failed: {e}")
-         traceback.print_exc()
-         return ""
-
- @tool
- def add_document_to_knowledge_base(file_path: str) -> str:
-     """Process and add a document to the global knowledge base."""
-     try:
-         print(f"🔍 Processing file: {file_path}")
-
-         if not os.path.exists(file_path):
-             return f"❌ File not found: {file_path}"
-
-         doc_id = os.path.basename(file_path)
-
-         # Try multiple extraction methods
-         extracted_text = ""
-
-         # Method 1: Try PyPDF2 first (more reliable)
-         if file_path.lower().endswith('.pdf'):
-             extracted_text = extract_text_with_pypdf2(file_path)
-
-         # Method 2: Try Docling if PyPDF2 failed
-         if not extracted_text:
-             print("🔄 PyPDF2 failed, trying Docling...")
-             extracted_text = extract_text_with_docling(file_path)
-
-         # Method 3: Simple file reading for text files
-         if not extracted_text and file_path.lower().endswith(('.txt', '.md')):
-             try:
-                 with open(file_path, 'r', encoding='utf-8') as f:
-                     extracted_text = f.read()
-             except Exception as e:
-                 print(f"❌ Text file reading failed: {e}")
-
-         if not extracted_text or len(extracted_text.strip()) < 50:
-             return f"❌ Could not extract meaningful text from {doc_id}. File may be image-based PDF or corrupted."
-
-         print(f"📝 Successfully extracted {len(extracted_text)} characters")
-
-         # Create summary
-         summary_text = extracted_text[:3000]  # Limit for API
-         summary_prompt = f"""Summarize this document in 2-3 clear sentences, focusing on the main topics and key points:
-
-         {summary_text}"""
-
-         try:
-             summary_response = llm.invoke(summary_prompt)
-             doc_summary = summary_response.content.strip()
-         except Exception as e:
-             print(f"⚠️ Summary generation failed: {e}")
-             doc_summary = f"Document containing {len(extracted_text)} characters of text"
-
-         print(f"✅ Summary: {doc_summary}")
-
-         # Split into chunks (simple approach)
-         chunk_size = 1000
-         overlap = 100
-         text_chunks = []
-
-         for i in range(0, len(extracted_text), chunk_size - overlap):
-             chunk = extracted_text[i:i + chunk_size].strip()
-             if len(chunk) > 100:  # Skip tiny chunks
-                 text_chunks.append(chunk)
-
-         print(f"🔄 Creating {len(text_chunks)} chunks and embeddings...")
-
-         # Create embeddings and prepare data
-         chunks_data = []
-         for i, chunk_text in enumerate(text_chunks):
              try:
-                 embedding = embeddings.embed_query(chunk_text)
-
-                 chunk_data = {
-                     "text": chunk_text,
-                     "embedding": embedding,  # Always use 'embedding' as column name
-                     "source": doc_id,
-                     "doc_id": doc_id,
-                     "chunk_id": i,
-                     "summary": doc_summary
-                 }
-                 chunks_data.append(chunk_data)
-
              except Exception as e:
-                 print(f"⚠️ Failed to embed chunk {i}: {e}")
                  continue
-
-         if not chunks_data:
-             return f"❌ Failed to create any valid chunks from {doc_id}"
-
-         # Add to LanceDB
-         print(f"💾 Adding {len(chunks_data)} chunks to LanceDB...")
-         documents_table.add(chunks_data)
-
-         return f"""✅ Successfully processed {doc_id}:
-         - Extracted: {len(extracted_text)} characters
-         - Created: {len(chunks_data)} chunks
-         - Added to knowledge base
-         - Summary: {doc_summary}"""
-
-     except Exception as e:
-         print(f"❌ Error processing document: {str(e)}")
-         traceback.print_exc()
-         return f"❌ Error processing document: {str(e)}"
-
- @tool
- def search_text_directly(query: str, limit: int = 3) -> str:
-     """Search document text directly using keyword matching (fallback method)."""
-     try:
-         print(f"🔍 Direct text search for: {query}")
-
-         # Get all documents and search by text matching
-         all_docs = documents_table.to_pandas()
-
-         if all_docs.empty:
-             return "No documents in knowledge base."
-
-         # Simple keyword matching
-         query_lower = query.lower()
-         matches = []
-
-         for _, doc in all_docs.iterrows():
-             text_lower = doc['text'].lower()
-             if any(word in text_lower for word in query_lower.split()):
-                 matches.append(doc)
-
-         if not matches:
-             return f"No text matches found for '{query}'"
-
-         # Sort by relevance (count of matching words)
-         def relevance_score(text):
-             return sum(1 for word in query_lower.split() if word in text.lower())
-
-         matches.sort(key=lambda x: relevance_score(x['text']), reverse=True)
-         matches = matches[:limit]
-
-         print(f"📚 Found {len(matches)} text matches")
-
-         # Format results
-         formatted_results = []
-         for i, doc in enumerate(matches, 1):
-             text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
-             formatted_results.append(
-                 f"📄 **Match {i}** (from {doc['source']}):\n{text_preview}\n"
-             )
-
-         return "\n" + "="*60 + "\n".join(formatted_results)
-
      except Exception as e:
-         print(f"Error in direct text search: {str(e)}")
-         return f"❌ Error in direct text search: {str(e)}"
-
- @tool
- def search_knowledge_base(query: str, limit: int = 3) -> str:
-     """Search the global knowledge base for relevant information."""
-     try:
-         print(f"🔍 Searching knowledge base for: {query}")
-
-         # Create query embedding
-         query_vector = embeddings.embed_query(query)
-
-         # Simple search without specifying vector column (let LanceDB auto-detect)
-         results = documents_table.search(query_vector).limit(limit).to_list()
-
-         if not results:
-             return "No relevant documents found in knowledge base."
-
-         print(f"📚 Found {len(results)} relevant chunks")
-
-         # Format results nicely
-         formatted_results = []
-         for i, doc in enumerate(results, 1):
-             text_preview = doc['text'][:500] + "..." if len(doc['text']) > 500 else doc['text']
-             formatted_results.append(
-                 f"📄 **Result {i}** (from {doc['source']}):\n{text_preview}\n"
-             )
-
-         return "\n" + "="*60 + "\n".join(formatted_results)
-
-     except Exception as e:
-         print(f"❌ Error searching knowledge base: {str(e)}")
-         traceback.print_exc()
-         return f"❌ Error searching knowledge base: {str(e)}"
-
- # State definition using modern LangGraph patterns
- class AgentState(BaseModel):
-     messages: Annotated[list, add_messages]
-     user_input: str = ""
-     uploaded_file_path: Optional[str] = None
-
- def agent_node(state: AgentState):
-     """Agent node using create_react_agent"""
-     tools = [search_knowledge_base, add_document_to_knowledge_base, search_text_directly]
-
-     # Create the agent
-     agent = create_react_agent(llm, tools)
-
-     # Prepare the message
-     user_message = state.user_input
-     if state.uploaded_file_path:
-         user_message = f"I uploaded a file: {state.uploaded_file_path}. Please process it into the knowledge base and tell me about its contents. Then answer: {user_message}"
-
-     # Invoke the agent
-     try:
-         result = agent.invoke({
-             "messages": [{"role": "user", "content": user_message}]
-         })
-
-         return {
-             "messages": result["messages"],
-             "user_input": state.user_input,
-             "uploaded_file_path": state.uploaded_file_path
-         }
-
-     except Exception as e:
-         error_msg = f"❌ Agent error: {str(e)}"
-         print(error_msg)
-         traceback.print_exc()
-         return {
-             "messages": state.messages + [{"role": "assistant", "content": error_msg}],
-             "user_input": state.user_input,
-             "uploaded_file_path": state.uploaded_file_path
-         }
-
- # Build workflow
- workflow = StateGraph(AgentState)
- workflow.add_node("agent", agent_node)
- workflow.set_entry_point("agent")
- workflow.add_edge("agent", END)
- app = workflow.compile()
-
- def process_chat(message, history, uploaded_file):
-     """Process chat with file upload handling"""
-     print(f"📥 Message: {message}")
-     print(f"📁 File: {uploaded_file}")
-
-     # Handle file upload
-     permanent_file_path = None
-     if uploaded_file is not None:
-         upload_dir = "./uploaded_docs"
-         os.makedirs(upload_dir, exist_ok=True)
-
-         filename = os.path.basename(uploaded_file.name)
-         permanent_file_path = os.path.join(upload_dir, filename)
-
-         try:
-             shutil.copy2(uploaded_file.name, permanent_file_path)
-             print(f"📋 Copied to: {permanent_file_path}")
-         except Exception as e:
-             print(f"❌ File copy failed: {e}")
-             permanent_file_path = None
-
-     # Create state and run agent
-     state = AgentState(
-         messages=[],
-         user_input=message,
-         uploaded_file_path=permanent_file_path
-     )
-
-     try:
-         result = app.invoke(state)
-         # Get the last assistant message
-         assistant_messages = [msg for msg in result['messages']
-                               if hasattr(msg, 'type') and msg.type == 'ai' or
-                               (isinstance(msg, dict) and msg.get('role') == 'assistant')]
-
-         if assistant_messages:
-             response = assistant_messages[-1].content if hasattr(assistant_messages[-1], 'content') else str(assistant_messages[-1])
-         else:
-             # Fallback: get the last message regardless of type
-             last_msg = result['messages'][-1] if result['messages'] else None
-             if last_msg:
-                 response = last_msg.content if hasattr(last_msg, 'content') else str(last_msg)
-             else:
-                 response = "No response generated"
-
-     except Exception as e:
-         response = f"❌ Error: {str(e)}"
-         print(f"❌ App error: {e}")
-         traceback.print_exc()
-
-     history.append([message, response])
-     return history, ""
-
- # Gradio interface
- with gr.Blocks(title="Knowledge Base Agent") as demo:
-     gr.Markdown("# 📚 Knowledge Base Agent")
-     gr.Markdown("Upload PDF documents and ask questions! Uses PyPDF2 as primary extraction method.")
-
-     chatbot = gr.Chatbot(height=500)
-
      with gr.Row():
-         msg = gr.Textbox(
-             label="Message",
-             placeholder="Upload a document or ask a question...",
-             scale=4
-         )
-         upload = gr.File(
-             label="Upload",
-             file_types=[".pdf", ".docx", ".txt", ".md"],
-             scale=1
          )

      msg.submit(
-         process_chat,
-         inputs=[msg, chatbot, upload],
-         outputs=[chatbot, msg]
      )

  if __name__ == "__main__":
-     demo.launch(debug=True)
  import os
+ import pathlib
+ from dotenv import load_dotenv
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from langchain_chroma import Chroma
+ from langchain.schema import Document
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.chains.base import Chain
+ from langchain.memory import ConversationBufferMemory
+ import gradio as gr
+ from langchain_core.retrievers import BaseRetriever
+ import re
  import PyPDF2

+ # Load environment variables and constants
+ CHUNK_SIZE = 1000
+ CHUNK_OVERLAP = 200
+ load_dotenv()
+
+ api_key = os.environ.get("OPENAI_API_KEY")
+ if not api_key:
+     raise ValueError("OPENAI_API_KEY environment variable is not set")
+
+ # Document Loader
+ class DocumentLoaderException(Exception):
+     pass

+ class DocumentLoader(object):
+     supported_files = {
+         "pdf": PyPDFLoader,
+         "txt": TextLoader,
+     }
+
+     @staticmethod
+     def load_documents(file_path: str) -> list[Document]:
+         """Load documents from file path"""
+         ext = pathlib.Path(file_path).suffix.lower().lstrip('.')
+         loader_class = DocumentLoader.supported_files.get(ext)
+         if not loader_class:
+             raise DocumentLoaderException(f"Unsupported file type: {ext}. Please provide a .txt or .pdf file")
+
+         loader = loader_class(file_path)
+         docs = loader.load()
+         return docs
+
+ # Embeddings and vector storage
+ def configure_retriever(docs: list[Document]) -> BaseRetriever:
+     """Configure retriever for document search"""
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+     chunks = text_splitter.split_documents(docs)
+
+     embeddings = OpenAIEmbeddings()
+     vectorstore = Chroma.from_documents(
+         documents=chunks,
+         embedding=embeddings,
+         persist_directory="chroma_db"
+     )
+
+     retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 6, "fetch_k": 20})
+     return retriever

+ # Chatbot
+ def configure_chatbot(retriever: BaseRetriever) -> Chain:
+     """Configure the conversational chatbot"""
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+     model = ChatOpenAI(
+         model="gpt-4o-mini",
+         temperature=2,
+         streaming=True,
+         max_tokens=15000
+     )
+
+     return ConversationalRetrievalChain.from_llm(
+         llm=model,
+         retriever=retriever,
+         memory=memory,
+         verbose=True
+     )

+ # Gradio app functions
+ def process_files(files):
+     """Process uploaded files and create chatbot"""
+     if not files:
+         return None
+
+     docs = []
+     for file in files:
+         if os.path.exists(file.name):
+             docs.extend(DocumentLoader.load_documents(file.name))
+
+     if not docs:
+         raise DocumentLoaderException("No documents were successfully loaded")
+
+     retriever = configure_retriever(docs)
+     return configure_chatbot(retriever)
+
+ def respond(message, chat_history, qa_chain):
+     """Handle chat responses"""
+     if not qa_chain:
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": "Please upload documents first."})
+         return "", chat_history
+
      try:
+         response = qa_chain.invoke({"question": message})
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": response["answer"]})
+         return "", chat_history
      except Exception as e:
+         error_message = f"Error: {str(e)}"
+         chat_history.append({"role": "user", "content": message})
+         chat_history.append({"role": "assistant", "content": error_message})
+         return "", chat_history

+ def process_files_with_status(files):
+     """Process files and return status"""
+     if not files:
+         return None, "Please upload at least one document."
      try:
+         result = process_files(files)
+         return result, "Documents processed successfully!"
      except Exception as e:
+         return None, f"Error: {str(e)}"

+ def clean_text(text):
+     # Remove special characters and extra whitespace
+     text = re.sub(r'[^\w\s.,!?-]', ' ', text)
+     # Collapse runs of spaces/tabs (keep newlines so the line filters below still apply)
+     text = re.sub(r'[ \t]+', ' ', text)
+     # Remove empty lines
+     text = re.sub(r'\n\s*\n', '\n', text)
+     # Remove lines that are just numbers or very short
+     text = '\n'.join(line for line in text.split('\n')
+                      if len(line.strip()) > 3 and not line.strip().isdigit())
+     # Remove common metadata patterns
+     text = re.sub(r'File size.*?MB', '', text)
+     text = re.sub(r'Format:.*?Edition', '', text)
+     text = re.sub(r'\d+\.\d+\s+out of \d+ stars', '', text)
+     text = re.sub(r'\d+\s+ratings', '', text)
+     # Remove "Read more" and similar phrases
+     text = re.sub(r'Read more.*$', '', text)
+     # Remove empty lines again
+     text = re.sub(r'\n\s*\n', '\n', text)
+     return text.strip()

+ def process_pdf(pdf_file):
+     try:
+         # Create a PDF reader object
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+         # Extract text from all pages
+         text = ""
+         for page in pdf_reader.pages:
              try:
+                 page_text = page.extract_text()
+                 if page_text:
+                     # Clean the text immediately after extraction
+                     cleaned_page = clean_text(page_text)
+                     if cleaned_page:  # Only add non-empty pages
+                         text += cleaned_page + "\n"
              except Exception as e:
+                 print(f"Warning: Error extracting text from page: {str(e)}")
                  continue

+         if not text.strip():
+             raise ValueError("No text could be extracted from the PDF")
+
+         # Split into chunks
+         chunks = split_into_chunks(text)
+
+         return chunks
      except Exception as e:
+         print(f"Error in process_pdf: {str(e)}")
+         raise

+ def split_into_chunks(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
+     """
+     Split text into overlapping chunks of specified size.
+
+     Args:
+         text (str): The text to split
+         chunk_size (int): Maximum size of each chunk
+         chunk_overlap (int): Number of characters to overlap between chunks
+
+     Returns:
+         list: List of text chunks
+     """
+     chunks = []
+     start = 0
+     text_length = len(text)
+
+     while start < text_length:
+         end = start + chunk_size
+
+         if start > 0:
+             start = start - chunk_overlap
+
+         if end >= text_length:
+             chunks.append(text[start:])
+             break
+
+         if end < text_length:
+             # Prefer a paragraph break, then a sentence break; only accept one
+             # that advances past the chunk start, to avoid empty chunks and a stalled loop
+             paragraph_break = text.rfind('\n\n', start, end)
+             if paragraph_break > start:
+                 end = paragraph_break
+             else:
+                 sentence_break = text.rfind('. ', start, end)
+                 if sentence_break > start:
+                     end = sentence_break + 1
+
+         chunks.append(text[start:end].strip())
+         start = end
+
+     return chunks

+ # Gradio Interface
+ with gr.Blocks(title="TorchAIassist") as demo:
+     gr.Markdown("# TorchAIassist")
+     gr.Markdown("A chatbot for your documents")
+
      with gr.Row():
+         file_output = gr.File(
+             label="Upload your documents",
+             file_count="multiple",
+             file_types=[".pdf", ".txt"]
          )
+         status = gr.Textbox(label="Status", interactive=False)
+
+     chatbot = gr.Chatbot(height=600, type="messages")
+     msg = gr.Textbox(
+         label="Ask a question about your documents",
+         placeholder="Let me know what you want to know about your documents"
+     )
+     clear = gr.Button("Clear")
+
+     qa_chain = gr.State(None)
+
+     # Event handlers
+     file_output.change(
+         fn=process_files_with_status,
+         inputs=[file_output],
+         outputs=[qa_chain, status]
+     )

      msg.submit(
+         fn=respond,
+         inputs=[msg, chatbot, qa_chain],
+         outputs=[msg, chatbot]
      )
+
+     clear.click(lambda: None, None, chatbot, queue=False)

  if __name__ == "__main__":
+     demo.launch()
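
For reference, a minimal headless sketch of how the pieces added in this commit fit together, assuming OPENAI_API_KEY is set, the module is importable as app, and a local sample.pdf exists (the module and file names are placeholders, not part of the commit):

    # Hypothetical smoke test for the new pipeline in app.py
    from app import DocumentLoader, configure_retriever, configure_chatbot

    docs = DocumentLoader.load_documents("sample.pdf")  # parse into LangChain Documents
    retriever = configure_retriever(docs)               # chunk, embed, index in Chroma
    qa_chain = configure_chatbot(retriever)             # conversational retrieval chain

    result = qa_chain.invoke({"question": "What is this document about?"})
    print(result["answer"])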