Spaces:

CaffeinatedCoding
/

ReportRaahat

Running

ReportRaahat / backend /app /routers /analyze.py

ReportRaahat CI

Deploy from GitHub: cbc36259c5ce4062cd4e64b876308f9378e3ebe2

542c765 9 days ago

14.5 kB

	import io
	import re
	import random
	from fastapi import APIRouter, UploadFile, File, Form, HTTPException
	from app.schemas import AnalyzeResponse, Finding
	from app.mock_data import MOCK_CASES
	from app.ml.rag import retrieve_reference_range, determine_status_vs_india
	from app.ml.model import simplify_finding

	router = APIRouter()


	def _extract_pdf_text(file_bytes: bytes) -> str:
	    """Extract text from a PDF page by page with pdfplumber.

	    For each page the plain ``extract_text()`` result is used unless it is
	    missing or shorter than 50 characters (after stripping) and the
	    ``layout=True`` extraction yields a longer string, in which case the
	    layout text is used for that page instead.

	    Any failure (including pdfplumber not being installed) is reported via a
	    debug print; whatever text was gathered before the failure is returned.
	    """
	    pieces = []
	    try:
	        import pdfplumber
	        print(f"[DEBUG] Attempting pdfplumber extraction on {len(file_bytes)} bytes PDF")
	        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
	            print(f"[DEBUG] PDF has {len(pdf.pages)} pages")
	            for idx, page in enumerate(pdf.pages):
	                page_text = page.extract_text() or ""
	                if page_text:
	                    print(f"[DEBUG] Page {idx}: extracted {len(page_text)} chars")
	                best = page_text

	                # Plain extraction got little: see whether layout mode does better.
	                if len(page_text.strip()) < 50:
	                    try:
	                        layout_text = page.extract_text(layout=True)
	                    except Exception as e:
	                        print(f"[DEBUG] Page {idx}: layout extraction failed: {e}")
	                        layout_text = None
	                    if layout_text and len(layout_text) > len(page_text):
	                        print(f"[DEBUG] Page {idx}: layout extraction better ({len(layout_text)} chars)")
	                        best = layout_text

	                # Pick the winner per page instead of patching the accumulated
	                # string, so identical text from earlier pages is never removed.
	                if best:
	                    pieces.append(best + "\n")

	        print(f"[DEBUG] Total text extracted via pdfplumber: {sum(map(len, pieces))} chars")
	    except Exception as e:
	        print(f"[DEBUG] pdfplumber error: {e}")
	    return "".join(pieces)


	def _extract_pdf_chars(file_bytes: bytes) -> str:
	    """Fallback PDF extraction: concatenate each page's ``chars`` entries.

	    Pages are separated by a single space. Failures are reported via a debug
	    print and the text gathered so far is returned.
	    """
	    pieces = []
	    try:
	        import pdfplumber
	        print("[DEBUG] Fallback: Attempting character-level extraction")
	        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
	            for idx, page in enumerate(pdf.pages):
	                chars = page.chars
	                if chars:
	                    page_text = "".join(c['text'] for c in chars)
	                    print(f"[DEBUG] Page {idx}: char extraction got {len(page_text)} chars")
	                    pieces.append(page_text + " ")
	    except Exception as e:
	        print(f"[DEBUG] Character-level extraction error: {e}")
	    return "".join(pieces)


	def _extract_image_text(file_bytes: bytes) -> str:
	    """Run pytesseract OCR on an image; return "" if OCR is unavailable or fails."""
	    print("[DEBUG] Image detected, attempting pytesseract OCR")
	    try:
	        import pytesseract
	        from PIL import Image
	        img = Image.open(io.BytesIO(file_bytes))
	        text = pytesseract.image_to_string(img)
	        print(f"[DEBUG] OCR extracted: {len(text)} chars")
	        return text
	    except Exception as e:
	        print(f"[DEBUG] OCR error (Tesseract may not be installed): {e}")
	        return ""


	def extract_text_from_upload(file_bytes: bytes, content_type: str) -> str:
	    """Extract raw text from an uploaded PDF or image.

	    Args:
	        file_bytes: Raw bytes of the uploaded file.
	        content_type: MIME type of the upload. Matching is case-insensitive;
	            a type containing "pdf" is read with pdfplumber, one containing
	            "image" is OCR'd with pytesseract, anything else yields "".

	    Returns:
	        The extracted text with surrounding whitespace stripped, or "" when
	        nothing could be extracted. This function never raises for extraction
	        failures; they are reported through ``[DEBUG]`` prints.
	    """
	    kind = (content_type or "").lower()
	    text = ""

	    if "pdf" in kind:
	        text = _extract_pdf_text(file_bytes)

	        # Fallback: character-level extraction if the direct method got
	        # (almost) nothing. Its output is appended to whatever was found.
	        if len(text.strip()) < 20:
	            text += _extract_pdf_chars(file_bytes)
	            print(f"[DEBUG] Character-level extraction: {len(text)} chars total")

	    elif "image" in kind:
	        text = _extract_image_text(file_bytes)

	    # NOTE(review): the preview below writes the first 100 characters of the
	    # uploaded report to stdout; confirm that is acceptable for patient data.
	    print(f"[DEBUG] Final extracted text: {len(text)} chars. Content preview: {text[:100]}")
	    return text.strip()


	# One result row: "<parameter> <value> <unit> [anything else]".
	# The parameter may contain letters, digits, whitespace, "/", "-" and
	# parentheses, e.g. "Haemoglobin (Hb)". The value is 1-4 digits with an
	# optional 1-2 digit fraction; everything after the unit is ignored.
	_LAB_LINE_RE = re.compile(
	    r'^([A-Za-z0-9\s\/\-()]{3,45}?)\s+([0-9]{1,4}(?:\.[0-9]{1,2})?)\s+([a-zA-Z/\.\\%µ\-0-9]+)(?:\s+.*)?$',
	    re.IGNORECASE,
	)

	# Header/metadata markers. Anchored at the start of a word so that e.g.
	# "ISO" does not discard "Cortisol" and "LAB" does not hit mid-word.
	_HEADER_LINE_RE = re.compile(
	    r'\b(?:INVESTIGATION|PATIENT|LAB|REPORT|DATE|ACCREDITED|REF\.|DISCLAIMER|'
	    r'INTERPRETATION|METROPOLIS|NABL|ISO)',
	    re.IGNORECASE,
	)

	# Demographic / bookkeeping words that mark a matched "parameter" as noise.
	# Whole words only, so "Average Glucose" is not rejected because of "age".
	_NOISE_PARAM_RE = re.compile(
	    r'\b(?:age|sex|years|male|female|collected|hours|times|name)\b',
	    re.IGNORECASE,
	)


	def parse_lab_values(text: str) -> list[dict]:
	    """Extract lab test name, value and unit from raw report text.

	    Each line is expected in the form
	    ``parameter VALUE UNIT reference STATUS``, for example
	    ``Haemoglobin (Hb) 9.2 g/dL 13.0 - 17.0 LOW``. Lines shorter than 15
	    characters, header/metadata lines, demographic rows and repeated
	    parameters (case-insensitive) are skipped.

	    Args:
	        text: Raw text of the report, one result per line.

	    Returns:
	        At most 50 dicts with string values under the keys ``"parameter"``,
	        ``"value"`` and ``"unit"``, in the order they appear in ``text``.
	    """
	    findings = []
	    seen = set()

	    for line in text.split('\n'):
	        line = line.strip()
	        if not line or len(line) < 15:
	            continue

	        # Skip headers and metadata
	        if _HEADER_LINE_RE.search(line):
	            continue

	        match = _LAB_LINE_RE.match(line)
	        if not match:
	            continue

	        param = match.group(1).strip()
	        value = match.group(2).strip()
	        unit = match.group(3).strip().rstrip('/ ')

	        # Clean parameter: drop an unbalanced trailing parenthesis,
	        # "Haemoglobin (Hb" -> "Haemoglobin"
	        if '(' in param and ')' not in param:
	            param = param[:param.index('(')].strip()

	        # Skip noise parameters and duplicates
	        key = param.lower()
	        if len(param) < 2 or key in seen:
	            continue
	        if _NOISE_PARAM_RE.search(param):
	            continue

	        # Unit must have at least one letter or valid symbol (rejects a
	        # second bare number being mistaken for the unit).
	        if not any(c.isalpha() or c in '/%µ-' for c in unit):
	            continue

	        seen.add(key)
	        findings.append({
	            "parameter": param,
	            "value": value,
	            "unit": unit
	        })

	    return findings[:50]  # Max 50 findings per report


	# Organ system -> lower-case keywords looked up in a finding's parameter name.
	# Dict order is the order in which organs are reported.
	_ORGAN_KEYWORDS = {
	    "LIVER": ["sgpt", "sgot", "alt", "ast", "bilirubin", "albumin", "ggt", "alkaline phosphatase"],
	    "KIDNEY": ["creatinine", "urea", "bun", "uric acid", "egfr", "potassium", "sodium"],
	    "BLOOD": ["hemoglobin", "haemoglobin", "hb", "rbc", "wbc", "platelet",
	              "hematocrit", "haematocrit", "mcv", "mch", "mchc"],
	    "HEART": ["troponin", "ck-mb", "ldh", "cholesterol", "triglyceride", "ldl", "vldl", "hdl"],
	    "THYROID": ["tsh", "t3", "t4", "ft3", "ft4", "free t3", "free t4"],
	    "DIABETES": ["glucose", "hba1c", "blood sugar", "fasting sugar"],
	    "SYSTEMIC": ["vitamin d", "vitamin b12", "ferritin", "crp", "esr", "folate"],
	}


	def _compile_organ_pattern(keywords):
	    """Build one regex that matches any of ``keywords`` in a lower-cased name.

	    Every keyword must start at a token boundary (not preceded by a letter or
	    digit). Abbreviations of up to four characters must also end at one, so
	    "ast" does not match "fasting" and "hb" does not match "hba1c"; longer
	    names may carry a suffix (e.g. "platelets").
	    """
	    alternatives = [
	        re.escape(kw) + (r"(?![a-z0-9])" if len(kw) <= 4 else "")
	        for kw in keywords
	    ]
	    return re.compile(r"(?<![a-z0-9])(?:" + "|".join(alternatives) + r")")


	_ORGAN_PATTERNS = {
	    organ: _compile_organ_pattern(keywords)
	    for organ, keywords in _ORGAN_KEYWORDS.items()
	}


	def detect_organs(findings: list[dict]) -> list[str]:
	    """Map lab tests to affected organ systems.

	    Args:
	        findings: Findings given either as dicts with a ``"parameter"`` key or
	            as objects with a ``parameter`` attribute. A missing or ``None``
	            parameter is treated as an empty name.

	    Returns:
	        The organ systems whose keywords occur in at least one parameter
	        name, in the fixed order LIVER, KIDNEY, BLOOD, HEART, THYROID,
	        DIABETES, SYSTEMIC. ``["SYSTEMIC"]`` when nothing matches.
	    """
	    names = []
	    for finding in findings:
	        # Handle both dict and Pydantic object
	        if isinstance(finding, dict):
	            raw_name = finding.get("parameter", "")
	        else:
	            raw_name = getattr(finding, "parameter", "")
	        names.append((raw_name or "").lower())

	    detected = [
	        organ
	        for organ, pattern in _ORGAN_PATTERNS.items()
	        if any(pattern.search(name) for name in names)
	    ]
	    return detected or ["SYSTEMIC"]


	# (keywords, flag) rules applied to each lower-cased parameter name.
	_DIETARY_FLAG_RULES = (
	    (("hemoglobin", "haemoglobin", "iron"), "INCREASE_IRON"),
	    (("vitamin d",), "INCREASE_VITAMIN_D"),
	    (("vitamin b12",), "INCREASE_VITAMIN_B12"),
	    (("cholesterol", "ldl"), "AVOID_FATTY_FOODS"),
	    (("glucose", "sugar", "hba1c"), "AVOID_SUGAR"),
	    (("creatinine", "urea"), "REDUCE_PROTEIN"),
	)
	_LIVER_EXERCISE_KEYWORDS = ("sgpt", "sgot", "bilirubin")


	def _classify_against_population(value_str, pop_mean, pop_std):
	    """Return ``(status, severity_score)`` for a value against mean/std.

	    Below mean - std is LOW (2); above mean + 2*std is CRITICAL (4); above
	    mean + std is HIGH (3); otherwise NORMAL (1). A value that is not a
	    number, or a missing/zero mean or std, is treated as NORMAL (1).
	    """
	    try:
	        value = float(value_str)
	    except ValueError:
	        return "NORMAL", 1
	    if not (pop_mean and pop_std):
	        return "NORMAL", 1
	    if value < pop_mean - pop_std:
	        return "LOW", 2
	    if value > pop_mean + pop_std * 2:
	        return "CRITICAL", 4
	    if value > pop_mean + pop_std:
	        return "HIGH", 3
	    return "NORMAL", 1


	def _collect_lifestyle_flags(parameter_names, severity_level):
	    """Return ``(dietary_flags, exercise_flags)`` without duplicates.

	    Flags are keyed on the parameter name only and are listed in first-seen
	    order. When no liver marker sets an exercise flag, the overall severity
	    decides between LIGHT_WALKING_ONLY and NORMAL_ACTIVITY.
	    """
	    dietary = []
	    exercise = []
	    for name in parameter_names:
	        name_lower = name.lower()
	        for keywords, flag in _DIETARY_FLAG_RULES:
	            if any(kw in name_lower for kw in keywords):
	                dietary.append(flag)
	        if any(kw in name_lower for kw in _LIVER_EXERCISE_KEYWORDS):
	            exercise.append("LIGHT_WALKING_ONLY")

	    if not exercise:
	        if severity_level in ["MODERATE_CONCERN", "URGENT"]:
	            exercise = ["LIGHT_WALKING_ONLY"]
	        else:
	            exercise = ["NORMAL_ACTIVITY"]

	    # dict.fromkeys de-duplicates while keeping a stable order.
	    return list(dict.fromkeys(dietary)), list(dict.fromkeys(exercise))


	@router.post("/analyze", response_model=AnalyzeResponse)
	async def analyze_report(
	    file: UploadFile = File(...),
	    language: str = Form(default="EN")
	):
	    """Analyze an uploaded lab report (image or PDF).

	    Extracts text from the upload, parses lab values, looks up a reference
	    range for each value, classifies it, asks the model for a plain-language
	    explanation and assembles the overall response.

	    Args:
	        file: The uploaded report. A missing content type is treated as
	            "image/jpeg".
	        language: Accepted as a form field; not used by this handler.

	    Returns:
	        An ``AnalyzeResponse``. When fewer than 20 characters of text can be
	        extracted, a response with ``is_readable=False`` is returned. When no
	        lab value can be parsed or processed, a random entry of
	        ``MOCK_CASES`` is returned.
	    """
	    file_bytes = await file.read()
	    content_type = file.content_type or "image/jpeg"

	    # Step 1: Extract text from image/PDF
	    # NOTE(review): this is a synchronous call inside an async handler, so it
	    # runs on the event loop for the duration of the extraction.
	    raw_text = extract_text_from_upload(file_bytes, content_type)

	    if not raw_text or len(raw_text.strip()) < 20:
	        return AnalyzeResponse(
	            is_readable=False,
	            report_type="UNKNOWN",
	            findings=[],
	            affected_organs=[],
	            overall_summary_hindi="यह छवि पढ़ने में असमर्थ। कृपया एक स्पष्ट फोटो लें।",
	            overall_summary_english="Could not read this image. Please upload a clearer photo of the report.",
	            severity_level="NORMAL",
	            dietary_flags=[],
	            exercise_flags=[],
	            ai_confidence_score=0.0,
	            grounded_in="N/A",
	            disclaimer="Please consult a doctor for proper medical advice."
	        )

	    # Step 2: Parse lab values from text
	    raw_findings = parse_lab_values(raw_text)

	    if not raw_findings:
	        # Fallback to mock data if parsing fails
	        # NOTE(review): this returns an unrelated sample analysis for a real
	        # upload; confirm this is intended outside of demos.
	        return random.choice(MOCK_CASES)

	    # Step 3: For each finding — RAG retrieval + model simplification
	    processed_findings = []
	    severity_scores = []

	    for raw in raw_findings:
	        try:
	            param = raw["parameter"]
	            value_str = raw["value"]
	            unit = raw["unit"]

	            # RAG: get Indian population reference range
	            ref = retrieve_reference_range(param, unit)
	            pop_mean = ref.get("population_mean")
	            pop_std = ref.get("population_std")

	            # Determine status
	            status, score = _classify_against_population(value_str, pop_mean, pop_std)

	            status_str = (
	                f"Indian population average: {pop_mean} {unit}"
	                if pop_mean else "Reference data from Indian population"
	            )

	            # Model: simplify the finding
	            simplified = simplify_finding(param, value_str, unit, status, status_str)

	            processed_findings.append(Finding(
	                parameter=param,
	                value=value_str,
	                unit=unit,
	                status=status,
	                simple_name_hindi=param,
	                simple_name_english=param,
	                layman_explanation_hindi=simplified["hindi"],
	                layman_explanation_english=simplified["english"],
	                indian_population_mean=pop_mean,
	                indian_population_std=pop_std,
	                status_vs_india=status_str,
	                normal_range=f"{ref.get('p5', 'N/A')} - {ref.get('p95', 'N/A')} {unit}"
	            ))
	            # Record the score only once the finding has been kept, so a
	            # finding dropped by the except below cannot raise the overall
	            # severity.
	            severity_scores.append(score)

	        except Exception as e:
	            print(f"Error processing finding {raw}: {e}")
	            continue

	    if not processed_findings:
	        return random.choice(MOCK_CASES)

	    # Step 4: Determine overall severity
	    max_score = max(severity_scores) if severity_scores else 1
	    severity_map = {1: "NORMAL", 2: "MILD_CONCERN", 3: "MODERATE_CONCERN", 4: "URGENT"}
	    severity_level = severity_map.get(max_score, "NORMAL")

	    # Step 5: Detect affected organs
	    affected_organs = detect_organs(processed_findings)

	    # Step 6: Generate dietary/exercise flags
	    dietary_flags, exercise_flags = _collect_lifestyle_flags(
	        [f.parameter for f in processed_findings], severity_level
	    )

	    # Step 7: Confidence score based on how many findings were grounded
	    grounded_count = sum(1 for f in processed_findings if f.indian_population_mean)
	    confidence = min(95.0, 60.0 + (grounded_count / max(len(processed_findings), 1)) * 35.0)

	    # Step 8: Overall summaries
	    abnormal = [f for f in processed_findings if f.status in ["HIGH", "LOW", "CRITICAL"]]
	    if abnormal:
	        hindi_summary = f"आपकी रिपोर्ट में {len(abnormal)} असामान्य मान पाए गए। {abnormal[0].layman_explanation_hindi} डॉक्टर से मिलें।"
	        english_summary = f"Your report shows {len(abnormal)} abnormal values. {abnormal[0].layman_explanation_english} Please consult your doctor."
	    else:
	        hindi_summary = "आपकी सभी जांच सामान्य हैं। अपना स्वास्थ्य ऐसे ही बनाए रखें।"
	        english_summary = "All your test values appear to be within normal range. Keep up your healthy lifestyle."

	    return AnalyzeResponse(
	        is_readable=True,
	        report_type="LAB_REPORT",
	        findings=processed_findings,
	        affected_organs=affected_organs,
	        overall_summary_hindi=hindi_summary,
	        overall_summary_english=english_summary,
	        severity_level=severity_level,
	        dietary_flags=dietary_flags,
	        exercise_flags=exercise_flags,
	        ai_confidence_score=round(confidence, 1),
	        grounded_in="Fine-tuned Flan-T5-small + FAISS over NidaanKosha 100K Indian lab readings",
	        disclaimer="This is an AI-assisted analysis. It is not a medical diagnosis. Please consult a qualified doctor."
	    )


	@router.get("/mock-analyze", response_model=AnalyzeResponse)
	async def mock_analyze(case: int = None):
	    """Return mock data for frontend development.

	    ``case`` selects an entry of ``MOCK_CASES`` by index; when it is omitted
	    or out of range a random entry is returned instead.
	    """
	    index_is_valid = case is not None and 0 <= case < len(MOCK_CASES)
	    return MOCK_CASES[case] if index_is_valid else random.choice(MOCK_CASES)