Spaces:
Running
Running
Jatin Mehra committed on
Commit ·
4a31622
1
Parent(s): 24b32e6
Enhance PDF chunking logic and add validation for chat requests to improve data integrity and user experience
Browse files
app.py
CHANGED
|
@@ -178,16 +178,7 @@ async def upload_pdf(
|
|
| 178 |
print("Warning: TAVILY_API_KEY is not set. Web search will not function.")
|
| 179 |
|
| 180 |
documents = process_pdf_file(file_path)
|
| 181 |
-
|
| 182 |
-
# The value 1500 might be too large if estimate_tokens is text_len // 4, as it means ~6000 characters.
|
| 183 |
-
# Let's use a smaller max_length for chunks for better granularity in RAG retrieval.
|
| 184 |
-
# For `bge-large-en-v1.5` (max sequence length 512 tokens), chunks around 250-400 tokens are often good.
|
| 185 |
-
# If estimate_tokens is len(text)//4, then max_length of 250 tokens is roughly 1000 characters.
|
| 186 |
-
# Let's use max_length=256 (tokens) for chunker config, so about 1024 characters.
|
| 187 |
-
# The chunk_text function uses max_length as character count / 4. So if we want 256 tokens, max_length = 256*4 = 1024
|
| 188 |
-
# However, the current chunk_text logic is `estimate_tokens(current_chunk + paragraph) <= max_length // 4`.
|
| 189 |
-
# This means `max_length` is already considered a token limit. So `max_length=256` (tokens) is the target.
|
| 190 |
-
chunks_with_metadata = chunk_text(documents, max_length=256) # max_length in tokens
|
| 191 |
|
| 192 |
embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
|
| 193 |
embeddings, _ = create_embeddings(chunks_with_metadata, embedding_model) # Chunks are already with metadata
|
|
@@ -222,11 +213,25 @@ async def upload_pdf(
|
|
| 222 |
# Route to chat with the document
|
| 223 |
@app.post("/chat")
|
| 224 |
async def chat(request: ChatRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
session, found = load_session(request.session_id, model_name=request.model_name)
|
| 226 |
if not found:
|
| 227 |
raise HTTPException(status_code=404, detail="Session not found or expired. Please upload a document first.")
|
| 228 |
|
| 229 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
# Per-request memory to ensure chat history is correctly loaded for the agent
|
| 231 |
agent_memory = ConversationBufferMemory(memory_key="chat_history", input_key="input", return_messages=True)
|
| 232 |
for entry in session.get("chat_history", []):
|
|
@@ -237,15 +242,14 @@ async def chat(request: ChatRequest):
|
|
| 237 |
current_request_tools = []
|
| 238 |
|
| 239 |
# 1. Add the document-specific vector search tool
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
print(f"Warning: Session {request.session_id} missing data for vector_database_search tool.")
|
| 249 |
|
| 250 |
# 2. Conditionally add Tavily (web search) tool
|
| 251 |
if request.use_search:
|
|
@@ -270,6 +274,10 @@ async def chat(request: ChatRequest):
|
|
| 270 |
k=5 # Number of chunks for initial context
|
| 271 |
)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
response = agentic_rag(
|
| 274 |
session["llm"],
|
| 275 |
current_request_tools, # Pass the dynamically assembled list of tools
|
|
@@ -280,6 +288,8 @@ async def chat(request: ChatRequest):
|
|
| 280 |
)
|
| 281 |
|
| 282 |
response_output = response.get("output", "Sorry, I could not generate a response.")
|
|
|
|
|
|
|
| 283 |
session["chat_history"].append({"user": request.query, "assistant": response_output})
|
| 284 |
save_session(request.session_id, session) # Save updated history and potentially other modified session state
|
| 285 |
|
|
|
|
| 178 |
print("Warning: TAVILY_API_KEY is not set. Web search will not function.")
|
| 179 |
|
| 180 |
documents = process_pdf_file(file_path)
|
| 181 |
+
chunks_with_metadata = chunk_text(documents, max_length=1000) # Increased from 256 to 1000 tokens for better context
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
|
| 184 |
embeddings, _ = create_embeddings(chunks_with_metadata, embedding_model) # Chunks are already with metadata
|
|
|
|
| 213 |
# Route to chat with the document
|
| 214 |
@app.post("/chat")
|
| 215 |
async def chat(request: ChatRequest):
|
| 216 |
+
# Validate query
|
| 217 |
+
if not request.query or not request.query.strip():
|
| 218 |
+
raise HTTPException(status_code=400, detail="Query cannot be empty")
|
| 219 |
+
|
| 220 |
+
if len(request.query.strip()) < 3:
|
| 221 |
+
raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")
|
| 222 |
+
|
| 223 |
session, found = load_session(request.session_id, model_name=request.model_name)
|
| 224 |
if not found:
|
| 225 |
raise HTTPException(status_code=404, detail="Session not found or expired. Please upload a document first.")
|
| 226 |
|
| 227 |
try:
|
| 228 |
+
# Validate session data integrity
|
| 229 |
+
required_keys = ["index", "chunks", "model", "llm"]
|
| 230 |
+
missing_keys = [key for key in required_keys if key not in session]
|
| 231 |
+
if missing_keys:
|
| 232 |
+
print(f"Warning: Session {request.session_id} missing required data: {missing_keys}")
|
| 233 |
+
raise HTTPException(status_code=500, detail="Session data is incomplete. Please upload the document again.")
|
| 234 |
+
|
| 235 |
# Per-request memory to ensure chat history is correctly loaded for the agent
|
| 236 |
agent_memory = ConversationBufferMemory(memory_key="chat_history", input_key="input", return_messages=True)
|
| 237 |
for entry in session.get("chat_history", []):
|
|
|
|
| 242 |
current_request_tools = []
|
| 243 |
|
| 244 |
# 1. Add the document-specific vector search tool
|
| 245 |
+
vector_search_tool_instance = create_vector_search_tool(
|
| 246 |
+
faiss_index=session["index"],
|
| 247 |
+
document_chunks_with_metadata=session["chunks"], # Pass the correct variable
|
| 248 |
+
embedding_model=session["model"], # This is the SentenceTransformer model
|
| 249 |
+
max_chunk_length=1000,
|
| 250 |
+
k=10
|
| 251 |
+
)
|
| 252 |
+
current_request_tools.append(vector_search_tool_instance)
|
|
|
|
| 253 |
|
| 254 |
# 2. Conditionally add Tavily (web search) tool
|
| 255 |
if request.use_search:
|
|
|
|
| 274 |
k=5 # Number of chunks for initial context
|
| 275 |
)
|
| 276 |
|
| 277 |
+
print(f"Query: '{request.query}' - Found {len(initial_similar_chunks)} initial chunks")
|
| 278 |
+
if initial_similar_chunks:
|
| 279 |
+
print(f"Best chunk score: {initial_similar_chunks[0][1]:.4f}")
|
| 280 |
+
|
| 281 |
response = agentic_rag(
|
| 282 |
session["llm"],
|
| 283 |
current_request_tools, # Pass the dynamically assembled list of tools
|
|
|
|
| 288 |
)
|
| 289 |
|
| 290 |
response_output = response.get("output", "Sorry, I could not generate a response.")
|
| 291 |
+
print(f"Generated response length: {len(response_output)} characters")
|
| 292 |
+
|
| 293 |
session["chat_history"].append({"user": request.query, "assistant": response_output})
|
| 294 |
save_session(request.session_id, session) # Save updated history and potentially other modified session state
|
| 295 |
|