# Homoeopathy-Bot / document_processor.py
# yekkala's picture
# Create document_processor.py
# ddfb91f verified
"""
Document Processing for Case Analysis
Supports PDF, TXT, DOCX uploads
"""
import os
import tempfile
from typing import Dict, List, Optional
import PyPDF2
import docx
class DocumentProcessor:
    """Extract text from uploaded case documents and pick out homeopathic cues.

    The processor reads PDF, TXT and DOCX files, splits the text into
    sentences and files each sentence under every section whose keywords it
    contains (case-insensitive substring match).
    """

    # Section names returned by `_extract_homeopathic_info`, in output order.
    # "physical_symptoms" has no keywords of its own and therefore stays empty;
    # it is kept so the returned dict keeps its established shape.
    _SECTION_NAMES = (
        "symptoms",
        "modalities",
        "emotional_state",
        "physical_symptoms",
        "timing",
        "generalities",
    )

    # Keywords that file a sentence under a section.
    _KEYWORD_PATTERNS = {
        "symptoms": ["symptom", "complaint", "pain", "ache", "discomfort"],
        "modalities": ["worse", "better", "aggravated", "ameliorated", "relieved"],
        "emotional_state": ["anxious", "fearful", "irritable", "sad", "depressed", "angry"],
        "timing": ["morning", "evening", "night", "afternoon", "periodic"],
        "generalities": ["thirst", "hunger", "cold", "hot", "sweat"],
    }

    # Every modality keyword must be classified as one of these two groups,
    # otherwise a sentence is collected as a modality but never reported.
    _AGGRAVATION_TERMS = ("worse", "aggravated")
    _AMELIORATION_TERMS = ("better", "ameliorated", "relieved")

    _MAX_PER_SECTION = 5        # sentences kept per section
    _MAX_SENTENCE_CHARS = 200   # stored sentence length cap
    _MIN_SENTENCE_CHARS = 10    # sentences this short or shorter are ignored
    _DISPLAY_CHAR_LIMIT = 5000  # length of the "text" preview

    def __init__(self):
        self.supported_extensions = ['.pdf', '.txt', '.docx', '.doc']

    def process_uploaded_file(self, file_path: str, file_type: Optional[str] = None) -> Dict:
        """Process an uploaded document and extract its text.

        Args:
            file_path: Path of the file to read.
            file_type: Optional extension override. Case and a missing leading
                dot are tolerated ("PDF", "pdf" and ".pdf" are equivalent).
                When omitted, the extension of ``file_path`` is used.

        Returns:
            On success::

                {
                    "success": True,
                    "filename": str,            # base name of file_path
                    "text": str,                # first 5000 characters
                    "full_text": str,
                    "word_count": int,
                    "extracted_sections": Dict, # see _extract_homeopathic_info
                    "summary": str,
                }

            On failure ``{"success": False, "error": str}``. Errors raised
            while reading or parsing the file are reported this way rather
            than propagated.
        """
        if not file_path or not os.path.isfile(file_path):
            return {"success": False, "error": "File not found"}
        try:
            file_type = self._normalize_file_type(file_path, file_type)
            if file_type == '.pdf':
                text = self._extract_from_pdf(file_path)
            elif file_type == '.txt':
                # utf-8-sig drops a leading BOM; errors='replace' keeps files
                # in other encodings readable instead of failing the upload.
                with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                    text = f.read()
            elif file_type in ('.docx', '.doc'):
                # NOTE(review): python-docx is documented for .docx only; a
                # true legacy .doc is expected to fail here and be reported
                # through the error path below - confirm.
                text = self._extract_from_docx(file_path)
            else:
                return {"success": False, "error": f"Unsupported file type: {file_type}"}

            extracted = self._extract_homeopathic_info(text)
            return {
                "success": True,
                "filename": os.path.basename(file_path),
                "text": text[:self._DISPLAY_CHAR_LIMIT],  # Limit for display
                "full_text": text,
                "word_count": len(text.split()),
                "extracted_sections": extracted,
                "summary": self._generate_summary(extracted),
            }
        except Exception as e:
            # Boundary handler: callers receive an error dict, never a raise.
            return {"success": False, "error": str(e)}

    @staticmethod
    def _normalize_file_type(file_path: str, file_type: Optional[str]) -> str:
        """Return a lower-case, dot-prefixed extension for dispatching."""
        if not file_type:
            _, ext = os.path.splitext(file_path)
            return ext.lower()
        normalized = file_type.strip().lower()
        if not normalized.startswith('.'):
            normalized = '.' + normalized
        return normalized

    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, one line break between pages.

        Pages are joined with a newline so the last word of one page does not
        merge with the first word of the next. A page that yields no text is
        treated as an empty string.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    def _extract_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file, one line per paragraph."""
        doc = docx.Document(file_path)
        return "".join(f"{para.text}\n" for para in doc.paragraphs)

    def _extract_homeopathic_info(self, text: str) -> Dict:
        """Group sentences of ``text`` by the homeopathic keywords they contain.

        Sentences are obtained by splitting on ``.``. A sentence longer than
        10 characters is added (truncated to 200 characters) to every section
        whose keyword list has a case-insensitive substring match. Each
        section keeps at most its first 5 sentences.

        Returns:
            Dict mapping each section name to a list of sentences. ``None`` or
            empty text yields empty lists for all sections.
        """
        sections: Dict = {name: [] for name in self._SECTION_NAMES}
        for sentence in (text or "").split('.'):
            clean_sentence = sentence.strip()
            if len(clean_sentence) <= self._MIN_SENTENCE_CHARS:
                continue
            sentence_lower = sentence.lower()
            for category, keywords in self._KEYWORD_PATTERNS.items():
                if len(sections[category]) >= self._MAX_PER_SECTION:
                    continue  # section already full; later matches are dropped
                if any(keyword in sentence_lower for keyword in keywords):
                    sections[category].append(clean_sentence[:self._MAX_SENTENCE_CHARS])
        return sections

    @staticmethod
    def _sentences_with_any(sentences: List[str], terms) -> List[str]:
        """Return the sentences containing at least one of ``terms`` (case-insensitive)."""
        return [s for s in sentences if any(term in s.lower() for term in terms)]

    def _generate_summary(self, extracted: Dict) -> str:
        """Build a one-line summary of the extracted sections.

        Aggravations count modality sentences mentioning "worse" or
        "aggravated"; ameliorations count those mentioning "better",
        "ameliorated" or "relieved". Returns "No clear patterns identified"
        when nothing was extracted.
        """
        summary_parts = []
        symptoms = extracted.get("symptoms") or []
        modalities = extracted.get("modalities") or []
        emotional = extracted.get("emotional_state") or []
        if symptoms:
            summary_parts.append(f"Chief complaints: {len(symptoms)} identified")
        if modalities:
            worse_count = len(self._sentences_with_any(modalities, self._AGGRAVATION_TERMS))
            better_count = len(self._sentences_with_any(modalities, self._AMELIORATION_TERMS))
            summary_parts.append(f"Modalities: {worse_count} aggravations, {better_count} ameliorations")
        if emotional:
            summary_parts.append(f"Emotional patterns: {len(emotional)} noted")
        return "; ".join(summary_parts) if summary_parts else "No clear patterns identified"

    def extract_for_analysis(self, text: str) -> Dict:
        """Extract structured data for analysis.

        Runs the keyword extraction on ``text`` and flattens the first three
        sentences of each section into strings. ``location`` and ``sensation``
        are always empty strings, and ``source`` is always
        ``"document_upload"``.
        """
        extracted = self._extract_homeopathic_info(text)
        modalities = extracted["modalities"]
        aggravations = self._sentences_with_any(modalities, self._AGGRAVATION_TERMS)
        ameliorations = self._sentences_with_any(modalities, self._AMELIORATION_TERMS)
        # Convert to analysis format
        return {
            "chief_complaint": " ".join(extracted["symptoms"][:3]),
            "location": "",
            "sensation": "",
            "aggravations": "; ".join(aggravations[:3]),
            "ameliorations": "; ".join(ameliorations[:3]),
            "timing": "; ".join(extracted["timing"][:3]),
            "emotional_state": "; ".join(extracted["emotional_state"][:3]),
            "generalities": "; ".join(extracted["generalities"][:3]),
            "source": "document_upload",
        }
# Module-level DocumentProcessor instance, constructed once when this module
# is imported; presumably shared by callers that import `doc_processor`.
doc_processor = DocumentProcessor()