Spaces:
Sleeping
Sleeping
Update unified_document_processor.py
Browse files- unified_document_processor.py +167 -1
unified_document_processor.py
CHANGED
|
@@ -377,4 +377,170 @@ class UnifiedDocumentProcessor:
|
|
| 377 |
return response.choices[0].message.content
|
| 378 |
|
| 379 |
except Exception as e:
|
| 380 |
-
return f"Error processing your question: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
return response.choices[0].message.content
|
| 378 |
|
| 379 |
except Exception as e:
|
| 380 |
+
return f"Error processing your question: {str(e)}"
|
| 381 |
+
def get_detailed_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
    """Return the chunks most relevant to ``question`` together with their metadata.

    Runs a similarity query on ``self.collection`` restricted to chunks whose
    ``source_file`` metadata is one of ``selected_files``.

    Args:
        question: Free-text query passed to the collection as ``query_texts``.
        selected_files: Source-file names the search is limited to.
        n_results: Maximum number of chunks to request.

    Returns:
        On success ``{'success': True, 'results': [...], 'query': question}``,
        where every result has the keys ``content``, ``metadata``,
        ``relevance_score`` and ``source_info`` (``file``, ``type``, ``path``,
        ``context``). On failure ``{'success': False, 'error': <message>}``;
        this method never raises.
    """
    if not selected_files:
        # An empty ``$in`` list can never match a chunk, so skip the round
        # trip to the store and report the actual problem to the caller.
        return {
            'success': False,
            'error': "No files selected"
        }

    try:
        results = self.collection.query(
            query_texts=[question],
            n_results=n_results,
            where={'source_file': {'$in': selected_files}},
            include=["documents", "metadatas", "distances"],
        )

        # Results are nested one list per query text; only one text was sent.
        documents = results['documents'][0]
        if not documents:
            return {
                'success': False,
                'error': "No relevant content found"
            }

        detailed_results = []
        for doc, meta, distance in zip(documents, results['metadatas'][0], results['distances'][0]):
            meta = meta or {}

            # ``context`` is stored as a JSON string. A single corrupt entry
            # must not discard every other hit, so fall back to an empty dict.
            raw_context = meta.get('context')
            try:
                context = json.loads(raw_context) if raw_context else {}
            except (TypeError, ValueError):  # JSONDecodeError is a ValueError
                context = {}

            detailed_results.append({
                'content': doc,
                'metadata': meta,
                # NOTE(review): ``1 - distance`` is only a 0..1 similarity when
                # the collection uses a bounded (e.g. cosine) distance - confirm.
                'relevance_score': 1 - distance,
                'source_info': {
                    'file': meta.get('source_file', 'N/A'),
                    'type': meta.get('content_type', 'N/A'),
                    'path': meta.get('xml_path', 'N/A'),  # Only set for XML chunks
                    'context': context,
                },
            })

        return {
            'success': True,
            'results': detailed_results,
            'query': question
        }

    except Exception as e:
        # API boundary: callers expect an error dict rather than an exception.
        return {
            'success': False,
            'error': str(e)
        }
|
| 427 |
+
|
| 428 |
+
def get_hierarchical_context(self, question: str, selected_files: List[str], n_results: int = 5,
                             max_children: int = 5) -> Dict:
    """Extend the detailed search results with XML parent/child context.

    Calls ``self.get_detailed_context`` and, for every XML hit that carries a
    real ``xml_path``, attaches:

    * ``parent_info``: the chunk whose ``xml_path`` is the hit's path minus
      its last segment (omitted when no such chunk exists);
    * ``children_info``: up to ``max_children`` chunks exactly one path
      segment below the hit (omitted when there are none).

    Non-XML hits are passed through unchanged.

    Args:
        question: Free-text query.
        selected_files: Source-file names the search is limited to.
        n_results: Maximum number of primary hits to request.
        max_children: Maximum number of child chunks attached per hit.

    Returns:
        ``{'success': True, 'results': [...], 'query': question}`` on success,
        otherwise ``{'success': False, 'error': <message>}``. A failed result
        from ``get_detailed_context`` is returned as-is. Never raises.
    """
    try:
        initial_results = self.get_detailed_context(question, selected_files, n_results)
        if not initial_results['success']:
            return initial_results

        # source_file -> [(document, metadata), ...]; filled lazily so each
        # file is fetched at most once however many hits it contributes.
        chunks_by_file = {}
        hierarchical_results = []

        for result in initial_results['results']:
            metadata = result.get('metadata') or {}
            path = (result.get('source_info') or {}).get('path')
            source_file = metadata.get('source_file')

            # 'N/A' is the placeholder used when a chunk has no xml_path. It
            # contains a slash, so it must not be treated as a real path.
            has_real_path = bool(path) and path != 'N/A'

            if metadata.get('content_type') == 'xml' and has_real_path and source_file:
                if source_file not in chunks_by_file:
                    # Plain metadata lookup instead of a similarity query with
                    # an empty text. Only a single-field filter is used here;
                    # prefix matching is done in Python below because metadata
                    # filters offer no "starts with" operator.
                    # Assumes self.collection is a ChromaDB collection.
                    fetched = self.collection.get(
                        where={'source_file': source_file},
                        include=["documents", "metadatas"],
                    )
                    chunks_by_file[source_file] = [
                        (doc, meta or {})
                        for doc, meta in zip(fetched.get('documents') or [],
                                             fetched.get('metadatas') or [])
                    ]
                file_chunks = chunks_by_file[source_file]

                # Parent: same path with the last segment removed.
                parent_path = path.rpartition('/')[0]
                if parent_path:
                    for doc, meta in file_chunks:
                        if meta.get('xml_path') == parent_path:
                            result['parent_info'] = {
                                'content': doc,
                                'metadata': meta
                            }
                            break

                # Immediate children: exactly one extra segment below ``path``.
                child_prefix = path + '/'
                children = []
                for doc, meta in file_chunks:
                    if len(children) >= max_children:
                        break
                    candidate = meta.get('xml_path') or ''
                    remainder = candidate[len(child_prefix):]
                    if candidate.startswith(child_prefix) and remainder and '/' not in remainder:
                        children.append({
                            'content': doc,
                            'metadata': meta
                        })
                if children:
                    result['children_info'] = children

            hierarchical_results.append(result)

        return {
            'success': True,
            'results': hierarchical_results,
            'query': question
        }

    except Exception as e:
        # API boundary: callers expect an error dict rather than an exception.
        return {
            'success': False,
            'error': str(e)
        }
|
| 490 |
+
|
| 491 |
+
def get_summary_and_details(self, question: str, selected_files: List[str]) -> Dict:
    """Answer ``question`` with an LLM summary plus the supporting chunks.

    Collects hits through ``self.get_hierarchical_context``, renders them into
    a text prompt (XML hits include their path and, when present, parent and
    child content) and sends that prompt to ``self.groq_client``.

    Args:
        question: Free-text query.
        selected_files: Source-file names the search is limited to.

    Returns:
        On success ``{'success': True, 'summary': <model text>,
        'details': <hierarchical results>, 'query': question}``. A failed
        result from ``get_hierarchical_context`` is returned as-is, and any
        exception becomes ``{'success': False, 'error': <message>}``.
    """
    try:
        detailed_results = self.get_hierarchical_context(question, selected_files)
        if not detailed_results['success']:
            return detailed_results

        # One text section per hit.
        relevant_content = []
        for result in detailed_results['results']:
            content_type = (result.get('metadata') or {}).get('content_type')
            if content_type == 'xml':
                content_info = [
                    f"XML Path: {result['source_info']['path']}",
                    f"Content: {result['content']}",
                ]
                if 'parent_info' in result:
                    content_info.append(f"Parent: {result['parent_info']['content']}")
                if 'children_info' in result:
                    children_content = [child['content'] for child in result['children_info']]
                    content_info.append(f"Related Elements: {', '.join(children_content)}")
            else:
                content_info = [f"Content: {result['content']}"]

            relevant_content.append('\n'.join(content_info))

        # Joined outside the f-string on purpose: a backslash inside an
        # f-string expression is a SyntaxError before Python 3.12.
        content_section = '\n\n'.join(relevant_content)

        summary_prompt = (
            "Based on the following content, please provide:\n"
            "1. A concise answer to the question\n"
            "2. Key supporting points\n"
            "3. Related context if relevant\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Content:\n"
            f"{content_section}\n"
        )

        response = self.groq_client.chat.completions.create(
            messages=[{"role": "user", "content": summary_prompt}],
            model="llama3-8b-8192",
            temperature=0.2
        )

        return {
            'success': True,
            'summary': response.choices[0].message.content,
            'details': detailed_results['results'],
            'query': question
        }

    except Exception as e:
        # API boundary: callers expect an error dict rather than an exception.
        return {
            'success': False,
            'error': str(e)
        }
|