Spaces:

T0X1N
/

Agentic-RagBot

Sleeping

Nikhil Pravin Pise

refactor: major repository cleanup and bug fixes

6dc9d46 about 2 months ago

23.1 kB

	"""
	MediGuard AI RAG-Helper - Interactive CLI Chatbot
	Enables natural language conversation with the RAG system
	"""

	import json
	import sys
	import os
	from pathlib import Path
	from typing import Dict, Any, Tuple
	from datetime import datetime

	# Set UTF-8 encoding for Windows console
	if sys.platform == 'win32':
	try:
	sys.stdout.reconfigure(encoding='utf-8')
	sys.stderr.reconfigure(encoding='utf-8')
	except:
	# Fallback for older Python versions
	import codecs
	sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer, 'strict')
	sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')
	# Set console to UTF-8
	os.system('chcp 65001 > nul 2>&1')

	# Add parent directory to path for imports
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from langchain_core.prompts import ChatPromptTemplate
	from src.llm_config import get_chat_model
	from src.workflow import create_guild
	from src.state import PatientInput


	# ============================================================================
	# BIOMARKER EXTRACTION PROMPT
	# ============================================================================

	BIOMARKER_EXTRACTION_PROMPT = """You are a medical data extraction assistant.
	Extract biomarker values from the user's message.

	Known biomarkers (24 total):
	Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI,
	Hemoglobin, Platelets, WBC (White Blood Cells), RBC (Red Blood Cells),
	Hematocrit, MCV, MCH, MCHC, Heart Rate, Systolic BP, Diastolic BP,
	Troponin, C-reactive Protein, ALT, AST, Creatinine

	User message: {user_message}

	Extract all biomarker names and their values. Return ONLY valid JSON (no other text):
	{{
	"biomarkers": {{
	"Glucose": 140,
	"HbA1c": 7.5
	}},
	"patient_context": {{
	"age": null,
	"gender": null,
	"bmi": null
	}}
	}}

	If you cannot find any biomarkers, return {{"biomarkers": {{}}, "patient_context": {{}}}}.
	"""


	# ============================================================================
	# Component 1: Biomarker Extraction
	# ============================================================================

	def normalize_biomarker_name(name: str) -> str:
	"""Normalize biomarker names to standard format matching biomarker_references.json"""
	name_lower = name.lower().replace(" ", "").replace("-", "").replace("_", "")

	# Mapping of variations to standard names (matching biomarker_references.json)
	mappings = {
	"glucose": "Glucose",
	"bloodsugar": "Glucose",
	"bloodglucose": "Glucose",
	"cholesterol": "Cholesterol",
	"totalcholesterol": "Cholesterol",
	"triglycerides": "Triglycerides",
	"trig": "Triglycerides",
	"hba1c": "HbA1c",
	"a1c": "HbA1c",
	"hemoglobina1c": "HbA1c",
	"ldl": "LDL Cholesterol",
	"ldlcholesterol": "LDL Cholesterol",
	"hdl": "HDL Cholesterol",
	"hdlcholesterol": "HDL Cholesterol",
	"insulin": "Insulin",
	"bmi": "BMI",
	"bodymassindex": "BMI",
	"hemoglobin": "Hemoglobin",
	"hgb": "Hemoglobin",
	"hb": "Hemoglobin",
	"platelets": "Platelets",
	"plt": "Platelets",
	"wbc": "White Blood Cells",
	"whitebloodcells": "White Blood Cells",
	"whitecells": "White Blood Cells",
	"rbc": "Red Blood Cells",
	"redbloodcells": "Red Blood Cells",
	"redcells": "Red Blood Cells",
	"hematocrit": "Hematocrit",
	"hct": "Hematocrit",
	"mcv": "Mean Corpuscular Volume",
	"meancorpuscularvolume": "Mean Corpuscular Volume",
	"mch": "Mean Corpuscular Hemoglobin",
	"meancorpuscularhemoglobin": "Mean Corpuscular Hemoglobin",
	"mchc": "Mean Corpuscular Hemoglobin Concentration",
	"heartrate": "Heart Rate",
	"hr": "Heart Rate",
	"pulse": "Heart Rate",
	"systolicbp": "Systolic Blood Pressure",
	"systolic": "Systolic Blood Pressure",
	"sbp": "Systolic Blood Pressure",
	"diastolicbp": "Diastolic Blood Pressure",
	"diastolic": "Diastolic Blood Pressure",
	"dbp": "Diastolic Blood Pressure",
	"troponin": "Troponin",
	"creactiveprotein": "C-reactive Protein",
	"crp": "C-reactive Protein",
	"alt": "ALT",
	"alanineaminotransferase": "ALT",
	"ast": "AST",
	"aspartateaminotransferase": "AST",
	"creatinine": "Creatinine",
	}

	return mappings.get(name_lower, name)


	def extract_biomarkers(user_message: str) -> Tuple[Dict[str, float], Dict[str, Any]]:
	"""
	Extract biomarker values from natural language using LLM.

	Returns:
	Tuple of (biomarkers_dict, patient_context_dict)
	"""
	try:
	print(f" [DEBUG] Extracting from: '{user_message[:50]}...'")
	llm = get_chat_model(temperature=0.0)
	prompt = ChatPromptTemplate.from_template(BIOMARKER_EXTRACTION_PROMPT)

	chain = prompt \| llm
	response = chain.invoke({"user_message": user_message})

	# Parse JSON from LLM response
	content = response.content.strip()
	print(f" [DEBUG] LLM response: {content[:200]}...")

	# Try to extract JSON if wrapped in markdown code blocks
	if "```json" in content:
	content = content.split("```json")[1].split("```")[0].strip()
	elif "```" in content:
	content = content.split("```")[1].split("```")[0].strip()

	extracted = json.loads(content)
	biomarkers = extracted.get("biomarkers", {})
	patient_context = extracted.get("patient_context", {})

	print(f" [DEBUG] Extracted biomarkers: {biomarkers}")
	print(f" [DEBUG] Patient context: {patient_context}")

	# Normalize biomarker names
	normalized = {}
	for key, value in biomarkers.items():
	try:
	standard_name = normalize_biomarker_name(key)
	normalized[standard_name] = float(value)
	print(f" [DEBUG] Normalized '{key}' -> '{standard_name}' = {value}")
	except (ValueError, TypeError) as e:
	print(f"⚠️ Skipping invalid value for {key}: {value} (error: {e})")
	continue

	# Clean up patient context (remove null values)
	patient_context = {k: v for k, v in patient_context.items() if v is not None}

	print(f" [DEBUG] Final normalized: {normalized}")
	return normalized, patient_context

	except Exception as e:
	print(f"⚠️ Extraction failed: {e}")
	import traceback
	traceback.print_exc()
	return {}, {}


	# ============================================================================
	# Component 2: Disease Prediction
	# ============================================================================

	def predict_disease_simple(biomarkers: Dict[str, float]) -> Dict[str, Any]:
	"""
	Simple rule-based disease prediction based on key biomarkers.
	"""
	scores = {
	"Diabetes": 0.0,
	"Anemia": 0.0,
	"Heart Disease": 0.0,
	"Thrombocytopenia": 0.0,
	"Thalassemia": 0.0
	}

	# Diabetes indicators
	glucose = biomarkers.get("Glucose", 0)
	hba1c = biomarkers.get("HbA1c", 0)
	if glucose > 126:
	scores["Diabetes"] += 0.4
	if glucose > 180:
	scores["Diabetes"] += 0.2
	if hba1c >= 6.5:
	scores["Diabetes"] += 0.5

	# Anemia indicators
	hemoglobin = biomarkers.get("Hemoglobin", 0)
	mcv = biomarkers.get("MCV", 0)
	if hemoglobin < 12.0:
	scores["Anemia"] += 0.6
	if hemoglobin < 10.0:
	scores["Anemia"] += 0.2
	if mcv < 80:
	scores["Anemia"] += 0.2

	# Heart disease indicators
	cholesterol = biomarkers.get("Cholesterol", 0)
	troponin = biomarkers.get("Troponin", 0)
	ldl = biomarkers.get("LDL", 0)
	if cholesterol > 240:
	scores["Heart Disease"] += 0.3
	if troponin > 0.04:
	scores["Heart Disease"] += 0.6
	if ldl > 190:
	scores["Heart Disease"] += 0.2

	# Thrombocytopenia indicators
	platelets = biomarkers.get("Platelets", 0)
	if platelets < 150000:
	scores["Thrombocytopenia"] += 0.6
	if platelets < 50000:
	scores["Thrombocytopenia"] += 0.3

	# Thalassemia indicators (complex, simplified here)
	if mcv < 80 and hemoglobin < 12.0:
	scores["Thalassemia"] += 0.4

	# Find top prediction
	top_disease = max(scores, key=scores.get)
	confidence = scores[top_disease]

	# Ensure at least 0.5 confidence
	if confidence < 0.5:
	confidence = 0.5
	top_disease = "Diabetes" # Default

	# Normalize probabilities to sum to 1.0
	total = sum(scores.values())
	if total > 0:
	probabilities = {k: v/total for k, v in scores.items()}
	else:
	probabilities = scores

	return {
	"disease": top_disease,
	"confidence": confidence,
	"probabilities": probabilities
	}


	def predict_disease_llm(biomarkers: Dict[str, float], patient_context: Dict) -> Dict[str, Any]:
	"""
	Use LLM to predict most likely disease based on biomarker pattern.
	Falls back to rule-based if LLM fails.
	"""
	try:
	print(f" [DEBUG] Predicting for biomarkers: {biomarkers}")
	llm = get_chat_model(temperature=0.0)

	prompt = f"""You are a medical AI assistant. Based on these biomarker values,
	predict the most likely disease from: Diabetes, Anemia, Heart Disease, Thrombocytopenia, Thalassemia.

	Biomarkers:
	{json.dumps(biomarkers, indent=2)}

	Patient Context:
	{json.dumps(patient_context, indent=2)}

	Return ONLY valid JSON (no other text):
	{{
	"disease": "Disease Name",
	"confidence": 0.85,
	"probabilities": {{
	"Diabetes": 0.85,
	"Anemia": 0.08,
	"Heart Disease": 0.04,
	"Thrombocytopenia": 0.02,
	"Thalassemia": 0.01
	}}
	}}
	"""

	response = llm.invoke(prompt)
	content = response.content.strip()
	print(f" [DEBUG] Prediction LLM response: {content[:200]}...")

	# Try to extract JSON if wrapped in markdown
	if "```json" in content:
	content = content.split("```json")[1].split("```")[0].strip()
	elif "```" in content:
	content = content.split("```")[1].split("```")[0].strip()

	prediction = json.loads(content)

	# Validate required fields
	if "disease" in prediction and "confidence" in prediction and "probabilities" in prediction:
	print(f" [DEBUG] LLM prediction successful: {prediction['disease']} ({prediction['confidence']:.0%})")
	return prediction
	else:
	raise ValueError("Invalid prediction format")

	except Exception as e:
	print(f"⚠️ LLM prediction failed ({e}), using rule-based fallback")
	import traceback
	traceback.print_exc()
	return predict_disease_simple(biomarkers)


	# ============================================================================
	# Component 3: Conversational Formatter
	# ============================================================================

	def format_conversational(result: Dict[str, Any], user_name: str = "there") -> str:
	"""
	Format technical JSON output into conversational response.
	"""
	# Extract key information
	summary = result.get("patient_summary", {})
	prediction = result.get("prediction_explanation", {})
	recommendations = result.get("clinical_recommendations", {})
	confidence = result.get("confidence_assessment", {})
	alerts = result.get("safety_alerts", [])

	disease = prediction.get("primary_disease", "Unknown")
	conf_score = prediction.get("confidence", 0.0)

	# Build conversational response
	response = []

	# 1. Greeting and main finding
	response.append(f"Hi {user_name}! 👋\n")
	response.append(f"Based on your biomarkers, I analyzed your results.\n")

	# 2. Primary diagnosis with confidence
	emoji = "🔴" if conf_score >= 0.8 else "🟡" if conf_score >= 0.6 else "🟢"
	response.append(f"{emoji} Primary Finding: {disease}")
	response.append(f" Confidence: {conf_score:.0%}\n")

	# 3. Critical safety alerts (if any)
	critical_alerts = [a for a in alerts if a.get("severity") == "CRITICAL"]
	if critical_alerts:
	response.append("⚠️ IMPORTANT SAFETY ALERTS:")
	for alert in critical_alerts[:3]: # Show top 3
	response.append(f" • {alert.get('biomarker', 'Unknown')}: {alert.get('message', '')}")
	response.append(f" → {alert.get('action', 'Consult healthcare provider')}")
	response.append("")

	# 4. Key drivers explanation
	key_drivers = prediction.get("key_drivers", [])
	if key_drivers:
	response.append("🔍 Why this prediction?")
	for driver in key_drivers[:3]: # Top 3 drivers
	biomarker = driver.get("biomarker", "")
	value = driver.get("value", "")
	explanation = driver.get("explanation", "")
	# Truncate long explanations
	if len(explanation) > 150:
	explanation = explanation[:147] + "..."
	response.append(f" • {biomarker} ({value}): {explanation}")
	response.append("")

	# 5. What to do next (immediate actions)
	immediate = recommendations.get("immediate_actions", [])
	if immediate:
	response.append("✅ What You Should Do:")
	for i, action in enumerate(immediate[:3], 1):
	response.append(f" {i}. {action}")
	response.append("")

	# 6. Lifestyle recommendations
	lifestyle = recommendations.get("lifestyle_changes", [])
	if lifestyle:
	response.append("🌱 Lifestyle Recommendations:")
	for i, change in enumerate(lifestyle[:3], 1):
	response.append(f" {i}. {change}")
	response.append("")

	# 7. Disclaimer
	response.append("ℹ️ Important: This is an AI-assisted analysis, NOT medical advice.")
	response.append(" Please consult a healthcare professional for proper diagnosis and treatment.\n")

	return "\n".join(response)


	# ============================================================================
	# Component 4: Helper Functions
	# ============================================================================

	def print_biomarker_help():
	"""Print list of supported biomarkers"""
	print("\n📋 Supported Biomarkers (24 total):")
	print("\n🩸 Blood Cells:")
	print(" • Hemoglobin, Platelets, WBC, RBC, Hematocrit, MCV, MCH, MCHC")
	print("\n🔬 Metabolic:")
	print(" • Glucose, Cholesterol, Triglycerides, HbA1c, LDL, HDL, Insulin, BMI")
	print("\n❤️ Cardiovascular:")
	print(" • Heart Rate, Systolic BP, Diastolic BP, Troponin, C-reactive Protein")
	print("\n🏥 Organ Function:")
	print(" • ALT, AST, Creatinine")
	print("\nExample: 'My glucose is 140, HbA1c is 7.5, cholesterol is 220'\n")


	def run_example_case(guild):
	"""Run example diabetes patient case"""
	print("\n📋 Running Example: Type 2 Diabetes Patient")
	print(" 52-year-old male with elevated glucose and HbA1c\n")

	example_biomarkers = {
	"Glucose": 185.0,
	"HbA1c": 8.2,
	"Cholesterol": 235.0,
	"Triglycerides": 210.0,
	"HDL": 38.0,
	"LDL": 160.0,
	"Hemoglobin": 13.5,
	"Platelets": 220000,
	"WBC": 7500,
	"Systolic BP": 145,
	"Diastolic BP": 92
	}

	prediction = {
	"disease": "Diabetes",
	"confidence": 0.87,
	"probabilities": {
	"Diabetes": 0.87,
	"Heart Disease": 0.08,
	"Anemia": 0.03,
	"Thrombocytopenia": 0.01,
	"Thalassemia": 0.01
	}
	}

	patient_input = PatientInput(
	biomarkers=example_biomarkers,
	model_prediction=prediction,
	patient_context={"age": 52, "gender": "male", "bmi": 31.2}
	)

	print("🔄 Running analysis...\n")
	result = guild.run(patient_input)

	response = format_conversational(result, "there")
	print("\n" + "="*70)
	print("🤖 RAG-BOT:")
	print("="*70)
	print(response)
	print("="*70 + "\n")


	def save_report(result: Dict, biomarkers: Dict):
	"""Save detailed JSON report to file"""
	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	disease = result.get("prediction_explanation", {}).get("primary_disease", "unknown")
	disease_safe = disease.replace(' ', '_').replace('/', '_')
	filename = f"report_{disease_safe}_{timestamp}.json"

	output_dir = Path("data/chat_reports")
	output_dir.mkdir(parents=True, exist_ok=True)

	filepath = output_dir / filename

	# Add biomarkers to report
	report = {
	"timestamp": timestamp,
	"biomarkers_input": biomarkers,
	"analysis_result": result
	}

	with open(filepath, 'w') as f:
	json.dump(report, f, indent=2)

	print(f"✅ Report saved to: {filepath}\n")


	# ============================================================================
	# Main Chat Interface
	# ============================================================================

	def chat_interface():
	"""
	Main interactive CLI chatbot for MediGuard AI RAG-Helper.
	"""
	# Print welcome banner
	print("\n" + "="*70)
	print("🤖 MediGuard AI RAG-Helper - Interactive Chat")
	print("="*70)
	print("\nWelcome! I can help you understand your blood test results.\n")
	print("You can:")
	print(" 1. Describe your biomarkers (e.g., 'My glucose is 140, HbA1c is 7.5')")
	print(" 2. Type 'example' to see a sample diabetes case")
	print(" 3. Type 'help' for biomarker list")
	print(" 4. Type 'quit' to exit\n")
	print("="*70 + "\n")

	# Initialize guild (one-time setup)
	print("🔧 Initializing medical knowledge system...")
	try:
	guild = create_guild()
	print("✅ System ready!\n")
	except Exception as e:
	print(f"❌ Failed to initialize system: {e}")
	print("\nMake sure:")
	print(" • Ollama is running (ollama serve)")
	print(" • Vector store exists (run: python src/pdf_processor.py)")
	print(" • Models are pulled (ollama pull llama3.1:8b-instruct)")
	return

	# Main conversation loop
	conversation_history = []
	user_name = "there"

	while True:
	try:
	# Get user input
	user_input = input("You: ").strip()

	if not user_input:
	continue

	# Handle special commands
	if user_input.lower() in ['quit', 'exit', 'q']:
	print("\n👋 Thank you for using MediGuard AI. Stay healthy!")
	break

	if user_input.lower() == 'help':
	print_biomarker_help()
	continue

	if user_input.lower() == 'example':
	run_example_case(guild)
	continue

	# Extract biomarkers from natural language
	print("\n🔍 Analyzing your input...")
	biomarkers, patient_context = extract_biomarkers(user_input)

	if not biomarkers:
	print("❌ I couldn't find any biomarker values in your message.")
	print(" Try: 'My glucose is 140 and HbA1c is 7.5'")
	print(" Or type 'help' to see all biomarkers I can analyze.\n")
	continue

	print(f"✅ Found {len(biomarkers)} biomarker(s): {', '.join(biomarkers.keys())}")

	# Check if we have enough biomarkers (minimum 2)
	if len(biomarkers) < 2:
	print("⚠️ I need at least 2 biomarkers for a reliable analysis.")
	print(" Can you provide more values?\n")
	continue

	# Generate disease prediction
	print("🧠 Predicting likely condition...")
	prediction = predict_disease_llm(biomarkers, patient_context)
	print(f"✅ Predicted: {prediction['disease']} ({prediction['confidence']:.0%} confidence)")
	print(f" [DEBUG] Full prediction: {prediction}")

	# Create PatientInput
	patient_input = PatientInput(
	biomarkers=biomarkers,
	model_prediction=prediction,
	patient_context=patient_context if patient_context else {"source": "chat"}
	)

	print(f" [DEBUG] PatientInput created:")
	print(f" - Biomarkers: {patient_input.biomarkers}")
	print(f" - Prediction: {patient_input.model_prediction}")
	print(f" - Context: {patient_input.patient_context}")

	# Run full RAG workflow
	print("📚 Consulting medical knowledge base...")
	print(" (This may take 15-25 seconds...)\n")

	result = guild.run(patient_input)

	# Format conversational response
	response = format_conversational(result, user_name)

	# Display response
	print("\n" + "="*70)
	print("🤖 RAG-BOT:")
	print("="*70)
	print(response)
	print("="*70 + "\n")

	# Save to history
	conversation_history.append({
	"user_input": user_input,
	"biomarkers": biomarkers,
	"prediction": prediction,
	"result": result
	})

	# Ask if user wants to save report
	save_choice = input("💾 Save detailed report to file? (y/n): ").strip().lower()
	if save_choice == 'y':
	save_report(result, biomarkers)

	print("\nYou can:")
	print(" • Enter more biomarkers for a new analysis")
	print(" • Type 'quit' to exit\n")

	except KeyboardInterrupt:
	print("\n\n👋 Interrupted. Thank you for using MediGuard AI!")
	break
	except Exception as e:
	print(f"\n❌ Analysis failed: {e}")
	print("\nThis might be due to:")
	print(" • Ollama not running (start with: ollama serve)")
	print(" • Insufficient system memory")
	print(" • Invalid biomarker values")
	print("\nTry again or type 'quit' to exit.\n")
	continue


	# ============================================================================
	# Entry Point
	# ============================================================================

	if __name__ == "__main__":
	try:
	chat_interface()
	except Exception as e:
	print(f"\n❌ Fatal error: {e}")
	print("Please check your setup and try again.")
	sys.exit(1)