"""
Document Processing for Case Analysis
Supports PDF, TXT, DOCX uploads
"""

import os
import tempfile
from typing import Dict, List, Optional

# The PDF/DOCX parsers are third-party. Guard the imports so the module still
# loads (and plain-text processing still works) when one of them is missing;
# the matching extractor raises a clear error at call time instead.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import docx
except ImportError:
    docx = None


class DocumentProcessor:
    """Extract text from uploaded case documents and tag homeopathic keywords.

    The public entry points are ``process_uploaded_file`` (file on disk ->
    result dict) and ``extract_for_analysis`` (raw text -> flat analysis dict).
    """

    # Maximum number of characters returned in the "text" preview field.
    _DISPLAY_CHAR_LIMIT = 5000
    # Sentences this short (after stripping) are ignored as noise.
    _MIN_SENTENCE_CHARS = 10
    # Each stored sentence is truncated to this many characters.
    _MAX_SENTENCE_CHARS = 200
    # Maximum number of sentences kept per category.
    _MAX_SENTENCES_PER_SECTION = 5

    # Category -> substrings that place a sentence in that category.
    # "physical_symptoms" has no keywords, so it is always returned empty.
    _KEYWORD_PATTERNS = {
        "symptoms": ["symptom", "complaint", "pain", "ache", "discomfort"],
        "modalities": ["worse", "better", "aggravated", "ameliorated", "relieved"],
        "emotional_state": ["anxious", "fearful", "irritable", "sad", "depressed", "angry"],
        "timing": ["morning", "evening", "night", "afternoon", "periodic"],
        "generalities": ["thirst", "hunger", "cold", "hot", "sweat"],
    }

    # The "modalities" keywords split by direction. Both the summary and the
    # analysis export use these, so a sentence matched on "aggravated" or
    # "relieved" is classified rather than silently dropped.
    _AGGRAVATION_TERMS = ("worse", "aggravated")
    _AMELIORATION_TERMS = ("better", "ameliorated", "relieved")

    def __init__(self):
        self.supported_extensions = ['.pdf', '.txt', '.docx', '.doc']

    def process_uploaded_file(self, file_path: str, file_type: Optional[str] = None) -> Dict:
        """
        Process uploaded document and extract text

        Args:
            file_path: Path of the uploaded file on disk.
            file_type: Optional extension such as ".pdf", "pdf" or "PDF".
                When omitted it is taken from ``file_path``.

        Returns:
            On success:
            {
                "success": True,
                "filename": str,
                "text": str,            # first 5000 characters, for display
                "full_text": str,
                "word_count": int,
                "extracted_sections": Dict,
                "summary": str
            }
            On failure: {"success": False, "error": str}. Failures are
            reported through this dict rather than raised.
        """
        # A missing/empty path is reported the same way as a nonexistent file
        # (os.path.exists(None) would otherwise raise TypeError).
        if not file_path or not os.path.exists(file_path):
            return {"success": False, "error": "File not found"}

        try:
            # Determine file type
            if not file_type:
                _, ext = os.path.splitext(file_path)
                file_type = ext.lower()
            normalized_type = self._normalize_file_type(file_type)

            # Extract text based on file type
            if normalized_type == '.pdf':
                text = self._extract_from_pdf(file_path)
            elif normalized_type == '.txt':
                # utf-8-sig drops a leading BOM; errors="replace" keeps one
                # stray byte from failing the whole upload.
                with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                    text = f.read()
            elif normalized_type in ('.docx', '.doc'):
                # NOTE(review): the docx library is documented for .docx
                # files; a legacy binary .doc will presumably fail here and
                # surface as an error result - confirm whether '.doc' should
                # stay listed as supported.
                text = self._extract_from_docx(file_path)
            else:
                return {"success": False, "error": f"Unsupported file type: {file_type}"}

            # Analyze text for homeopathic keywords
            extracted = self._extract_homeopathic_info(text)

            return {
                "success": True,
                "filename": os.path.basename(file_path),
                "text": text[:self._DISPLAY_CHAR_LIMIT],  # Limit for display
                "full_text": text,
                "word_count": len(text.split()),
                "extracted_sections": extracted,
                "summary": self._generate_summary(extracted),
            }

        except Exception as e:
            # Boundary handler: callers expect an error dict, not an exception.
            return {"success": False, "error": str(e)}

    @staticmethod
    def _normalize_file_type(file_type: str) -> str:
        """Return ``file_type`` lower-cased, trimmed and with a leading dot."""
        normalized = file_type.strip().lower()
        if normalized and not normalized.startswith('.'):
            normalized = '.' + normalized
        return normalized

    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF, one page per line group.

        Raises:
            ImportError: if PyPDF2 is not installed.
        """
        if PyPDF2 is None:
            raise ImportError("PyPDF2 is required to process PDF files")

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding no text object at all.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

        # Join with a newline so the last word of one page is not fused with
        # the first word of the next.
        return "\n".join(page_texts)

    def _extract_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX, one paragraph per line.

        Raises:
            ImportError: if the docx library is not installed.
        """
        if docx is None:
            raise ImportError("python-docx is required to process DOCX files")

        doc = docx.Document(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    def _extract_homeopathic_info(self, text: str) -> Dict:
        """Extract homeopathic information from text

        The text is split on '.', and every fragment longer than 10
        characters is filed under each category whose keyword it contains
        (case-insensitive substring match). Fragments are truncated to 200
        characters and each category keeps at most its first 5.

        Returns:
            Dict with keys "symptoms", "modalities", "emotional_state",
            "physical_symptoms", "timing" and "generalities", each a list of
            strings. Empty or ``None`` text yields all-empty lists.
        """
        # Common homeopathic sections
        sections = {
            "symptoms": [],
            "modalities": [],
            "emotional_state": [],
            "physical_symptoms": [],
            "timing": [],
            "generalities": [],
        }

        # Extract sentences containing keywords
        for sentence in (text or "").split('.'):
            clean_sentence = sentence.strip()
            if len(clean_sentence) <= self._MIN_SENTENCE_CHARS:
                continue
            sentence_lower = clean_sentence.lower()
            for category, keywords in self._KEYWORD_PATTERNS.items():
                if any(keyword in sentence_lower for keyword in keywords):
                    sections[category].append(clean_sentence[:self._MAX_SENTENCE_CHARS])

        # Limit each section
        for category in sections:
            sections[category] = sections[category][:self._MAX_SENTENCES_PER_SECTION]

        return sections

    @staticmethod
    def _mentions_any(sentence: str, terms) -> bool:
        """Return True if ``sentence`` contains any of ``terms`` (case-insensitive)."""
        lowered = sentence.lower()
        return any(term in lowered for term in terms)

    def _generate_summary(self, extracted: Dict) -> str:
        """Generate summary from extracted information

        Builds a "; "-separated line from the symptoms, modalities and
        emotional_state sections; returns "No clear patterns identified" when
        all three are empty or missing.
        """
        summary_parts = []

        symptoms = extracted.get("symptoms")
        if symptoms:
            summary_parts.append(f"Chief complaints: {len(symptoms)} identified")

        modalities = extracted.get("modalities")
        if modalities:
            worse_count = sum(
                1 for s in modalities if self._mentions_any(s, self._AGGRAVATION_TERMS)
            )
            better_count = sum(
                1 for s in modalities if self._mentions_any(s, self._AMELIORATION_TERMS)
            )
            summary_parts.append(
                f"Modalities: {worse_count} aggravations, {better_count} ameliorations"
            )

        emotional = extracted.get("emotional_state")
        if emotional:
            summary_parts.append(f"Emotional patterns: {len(emotional)} noted")

        return "; ".join(summary_parts) if summary_parts else "No clear patterns identified"

    def extract_for_analysis(self, text: str) -> Dict:
        """Extract structured data for analysis

        Returns a flat dict of strings. Each field holds at most three
        extracted sentences; "location" and "sensation" are always empty and
        "source" is always "document_upload".
        """
        extracted = self._extract_homeopathic_info(text)
        modalities = extracted["modalities"]

        aggravations = [
            s for s in modalities if self._mentions_any(s, self._AGGRAVATION_TERMS)
        ]
        ameliorations = [
            s for s in modalities if self._mentions_any(s, self._AMELIORATION_TERMS)
        ]

        # Convert to analysis format
        analysis_data = {
            # Joined with a single space, unlike the "; " used for the
            # other fields.
            "chief_complaint": " ".join(extracted["symptoms"][:3]),
            "location": "",
            "sensation": "",
            "aggravations": "; ".join(aggravations[:3]),
            "ameliorations": "; ".join(ameliorations[:3]),
            "timing": "; ".join(extracted["timing"][:3]),
            "emotional_state": "; ".join(extracted["emotional_state"][:3]),
            "generalities": "; ".join(extracted["generalities"][:3]),
            "source": "document_upload",
        }

        return analysis_data


# Global instance
doc_processor = DocumentProcessor()