Spaces:
Sleeping
Sleeping
File size: 6,048 Bytes
"""
Document Processing for Case Analysis
Supports PDF, TXT, DOCX uploads
"""
import os
import tempfile
from typing import Dict, List, Optional
import PyPDF2
import docx
class DocumentProcessor:
    """Extract text from an uploaded document and pick out keyword-matched sentences.

    Text is pulled from PDF, TXT, or DOCX files, split into sentences on ``.``,
    and each sentence is filed under every section whose keywords it contains
    (case-insensitive substring match).
    """

    # Section name -> keywords; a sentence containing any keyword is filed there.
    # Matching is by substring on purpose, so "ache" also catches "headache".
    _KEYWORD_PATTERNS = {
        "symptoms": ["symptom", "complaint", "pain", "ache", "discomfort"],
        "modalities": ["worse", "better", "aggravated", "ameliorated", "relieved"],
        "emotional_state": ["anxious", "fearful", "irritable", "sad", "depressed", "angry"],
        "timing": ["morning", "evening", "night", "afternoon", "periodic"],
        "generalities": ["thirst", "hunger", "cold", "hot", "sweat"],
    }

    # Every section present in the result. "physical_symptoms" has no keywords
    # in _KEYWORD_PATTERNS, so it is always returned as an empty list.
    _SECTION_NAMES = (
        "symptoms",
        "modalities",
        "emotional_state",
        "physical_symptoms",
        "timing",
        "generalities",
    )

    # The modality keywords split by direction. Summaries and analysis output
    # must use these same terms; otherwise a sentence collected through
    # "aggravated" or "relieved" is dropped when modalities are bucketed.
    _AGGRAVATION_TERMS = ("worse", "aggravated")
    _AMELIORATION_TERMS = ("better", "ameliorated", "relieved")

    _MAX_SENTENCES_PER_SECTION = 5
    _MAX_SENTENCE_CHARS = 200
    _MIN_SENTENCE_CHARS = 10  # sentences must be strictly longer than this
    _DISPLAY_TEXT_LIMIT = 5000

    def __init__(self):
        self.supported_extensions = ['.pdf', '.txt', '.docx', '.doc']

    def process_uploaded_file(self, file_path: str, file_type: Optional[str] = None) -> Dict:
        """Process an uploaded document and extract its text.

        Args:
            file_path: Path of the file to read.
            file_type: Optional extension overriding the one in ``file_path``.
                Case and a missing leading dot are tolerated ("PDF" == ".pdf").

        Returns:
            On success::

                {
                    "success": True,
                    "filename": str,             # basename of file_path
                    "text": str,                 # first 5000 characters
                    "full_text": str,
                    "word_count": int,           # whitespace-separated tokens
                    "extracted_sections": Dict,  # see _extract_homeopathic_info
                    "summary": str,
                }

            On failure ``{"success": False, "error": str}``. Exceptions raised
            while reading or parsing are caught and reported this way.
        """
        if not os.path.exists(file_path):
            return {"success": False, "error": "File not found"}

        try:
            # Fall back to the path's own extension when none is supplied.
            if not file_type:
                _, file_type = os.path.splitext(file_path)
            file_type = self._normalize_file_type(file_type)

            if file_type == '.pdf':
                text = self._extract_from_pdf(file_path)
            elif file_type == '.txt':
                # utf-8-sig drops a leading BOM; errors="replace" keeps one
                # stray non-UTF-8 byte from failing the whole upload.
                with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                    text = f.read()
            elif file_type in ['.docx', '.doc']:
                # NOTE(review): python-docx targets .docx; a legacy binary .doc
                # presumably fails here and is reported via the except below.
                text = self._extract_from_docx(file_path)
            else:
                return {"success": False, "error": f"Unsupported file type: {file_type}"}

            extracted = self._extract_homeopathic_info(text)
            return {
                "success": True,
                "filename": os.path.basename(file_path),
                "text": text[:self._DISPLAY_TEXT_LIMIT],  # Limit for display
                "full_text": text,
                "word_count": len(text.split()),
                "extracted_sections": extracted,
                "summary": self._generate_summary(extracted),
            }
        except Exception as e:
            # Boundary handler: callers get an error dict, never an exception.
            return {"success": False, "error": str(e)}

    @staticmethod
    def _normalize_file_type(file_type: str) -> str:
        """Return ``file_type`` lower-cased, stripped, and with a leading dot.

        An empty value stays empty so a file with no extension is still
        reported as unsupported rather than becoming ``"."``.
        """
        normalized = file_type.strip().lower()
        if normalized and not normalized.startswith('.'):
            normalized = '.' + normalized
        return normalized

    @staticmethod
    def _contains_any(sentence: str, terms) -> bool:
        """Return True if ``sentence`` contains any of ``terms`` (case-insensitive)."""
        lowered = sentence.lower()
        return any(term in lowered for term in terms)

    def _extract_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF by concatenating the text of every page."""
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # "or ''" guards against a page yielding None instead of a string
            # (assumed possible for pages with no text layer - TODO confirm).
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    def _extract_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX, one line per paragraph."""
        doc = docx.Document(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    def _extract_homeopathic_info(self, text: str) -> Dict:
        """Extract homeopathic information from text.

        Splits ``text`` on ``.`` and files each sentence longer than 10
        characters under every section whose keywords it contains. Sentences
        are truncated to 200 characters and each section keeps the first 5.

        Returns:
            Dict mapping each section name to a list of sentences.
        """
        sections: Dict[str, List[str]] = {name: [] for name in self._SECTION_NAMES}
        if not text:
            return sections

        for sentence in text.split('.'):
            clean_sentence = sentence.strip()
            if len(clean_sentence) <= self._MIN_SENTENCE_CHARS:
                continue
            for category, keywords in self._KEYWORD_PATTERNS.items():
                bucket = sections[category]
                # Capping while collecting keeps the same first-5 result as
                # collecting everything and slicing afterwards.
                if len(bucket) >= self._MAX_SENTENCES_PER_SECTION:
                    continue
                if self._contains_any(clean_sentence, keywords):
                    bucket.append(clean_sentence[:self._MAX_SENTENCE_CHARS])
        return sections

    def _generate_summary(self, extracted: Dict) -> str:
        """Generate a one-line summary from extracted information.

        Reads the "symptoms", "modalities" and "emotional_state" sections and
        joins the non-empty parts with "; ".
        """
        summary_parts = []
        if extracted["symptoms"]:
            summary_parts.append(f"Chief complaints: {len(extracted['symptoms'])} identified")
        if extracted["modalities"]:
            worse_count = sum(
                1 for s in extracted["modalities"]
                if self._contains_any(s, self._AGGRAVATION_TERMS)
            )
            better_count = sum(
                1 for s in extracted["modalities"]
                if self._contains_any(s, self._AMELIORATION_TERMS)
            )
            summary_parts.append(
                f"Modalities: {worse_count} aggravations, {better_count} ameliorations"
            )
        if extracted["emotional_state"]:
            summary_parts.append(f"Emotional patterns: {len(extracted['emotional_state'])} noted")
        return "; ".join(summary_parts) if summary_parts else "No clear patterns identified"

    def extract_for_analysis(self, text: str) -> Dict:
        """Extract structured data for analysis.

        Runs the keyword extraction and flattens up to three sentences per
        field into strings. "location" and "sensation" are always empty, and
        "source" is always "document_upload".
        """
        extracted = self._extract_homeopathic_info(text)
        modalities = extracted["modalities"]
        aggravations = [s for s in modalities if self._contains_any(s, self._AGGRAVATION_TERMS)]
        ameliorations = [s for s in modalities if self._contains_any(s, self._AMELIORATION_TERMS)]

        return {
            # Complaints are space-joined; every other field uses "; ".
            "chief_complaint": " ".join(extracted["symptoms"][:3]),
            "location": "",
            "sensation": "",
            "aggravations": "; ".join(aggravations[:3]),
            "ameliorations": "; ".join(ameliorations[:3]),
            "timing": "; ".join(extracted["timing"][:3]),
            "emotional_state": "; ".join(extracted["emotional_state"][:3]),
            "generalities": "; ".join(extracted["generalities"][:3]),
            "source": "document_upload",
        }
# Global instance: a single module-level DocumentProcessor created at import time.
doc_processor = DocumentProcessor()