Spaces:

Donlagon007
/

rag

Sleeping

App Files Files Community

rag / app.py

Donlagon007

Update app.py

ec441c7 verified 5 months ago

raw

history blame contribute delete

21.8 kB


	def _simple_response(self, scenario_params: dict, stats: dict) -> str:
	"""Simple fallback response without LLM"""
	if not scenario_params:
	return f"""Based on the analysis results, colorectal cancer screening reduces cancer incidence from {stats['cancer_rate_noscreening']:.2%} to {stats['cancer_rate_screening']:.2%} ({stats.get('cancer_reduction_pct', 0):.1f}% reduction) and advanced cancer from {stats['advca_rate_noscreening']:.2%} to {stats['advca_rate_screening']:.2%} ({stats.get('advca_reduction_pct', 0):.1f}% reduction). The cancer prevention effect is {stats['cancer_prevented_rate']:.2%} and the advanced cancer prevention rate is {stats['advca_prevented_rate']:.2%}.

	If you have specific parameter adjustments in mind, please provide them for a detailed comparison of the screening effectiveness with the adjusted parameters."""

	response = f"""
	📊 Analysis Results (Based on {stats['total_cases']:,} real cases with fixed random seed):

	🎯 Cancer Incidence:
	- No screening: {stats['cancer_rate_noscreening']:.2%}
	- With screening: {stats['cancer_rate_screening']:.2%}
	- Reduction: {stats.get('cancer_reduction_pct', 0):.1f}%

	🎯 Advanced Cancer Incidence:
	- No screening: {stats['advca_rate_noscreening']:.2%}
	- With screening: {stats['advca_rate_screening']:.2%}
	- Reduction: {stats.get('advca_reduction_pct', 0):.1f}%

	🔬 Prevention Effects:
	- Cancer prevention effect: {stats['cancer_prevented_rate']:.2%}
	- Advanced cancer prevention: {stats['advca_prevented_rate']:.2%}
	"""
	if scenario_params:
	response += f"\n🔧 Parameter Adjustments:\n"
	for param, value in scenario_params.items():
	if param == 'screening_interval':
	response += f" - {param}: {value} years\n"
	else:
	response += f" - {param}: {value:.1%}\n"

	return response # Colorectal Cancer Screening AI Analysis System - English Version


	import pandas as pd
	import numpy as np
	import random
	import re
	import json
	import os
	from typing import Dict, List, Optional
	from langchain_openai import ChatOpenAI
	from langchain.schema import SystemMessage, HumanMessage
	import pyreadstat
	import requests


	# Set API Key
	# A key already present in the environment (for example a deployment secret)
	# is kept as-is; the placeholder below is only a fallback and must be replaced
	# with your actual API key.
	os.environ.setdefault("OPENAI_API_KEY", "OPENAI_API_KEY")


	class ScreeningSimulator:
	    """Simulate a colorectal cancer screening programme on individual histories.

	    Each input row holds one person's health state per age in the columns
	    ``vstate50`` .. ``vstate80``. As used by this class, the state codes are:
	    1 healthy, 2/8 adenoma, 3/9 advanced adenoma, 4 cancer, 5 advanced cancer,
	    6/7 dead or terminal; 8 and 9 are counted as overdiagnosis when detected.

	    Attributes:
	        params: Default screening parameters (probabilities in [0, 1] plus
	            ``screening_interval`` in years).
	        data: Sampled input DataFrame, or None until ``load_data`` is called.
	        results: Per-person outcome DataFrame of the last simulation, or None.
	        random_seed: Seed for ``random`` and for the pandas sample.
	        numpy_seed: Seed for ``numpy.random``.
	    """

	    def __init__(self):
	        """Set the default parameters and the fixed random seeds."""
	        self.params = {
	            'participation_rate': 0.40,
	            'sensitivity_state2': 0.44,
	            'sensitivity_state3': 0.52,
	            'sensitivity_state4': 0.61,
	            'sensitivity_state5': 0.70,
	            'referral_rate': 0.70,
	            'treatment_success': 0.90,
	            'screening_interval': 2,  # years between screenings
	        }
	        self.data = None
	        self.results = None
	        # Fixed random seeds for reproducible results
	        self.random_seed = 246810
	        self.numpy_seed = 13579

	    def load_data(self):
	        """Download the SAS dataset if needed and keep a reproducible 1% sample.

	        Returns:
	            True once ``self.data`` has been populated.

	        Raises:
	            requests.RequestException: If the download fails or times out.
	        """
	        # Set random seed for reproducible sampling
	        np.random.seed(self.numpy_seed)
	        random.seed(self.random_seed)

	        DROPBOX_URL = "https://www.dropbox.com/scl/fi/p3n3g7h3wifzs26y0ylah/matched_with_error_all_noid.sas7bdat?rlkey=il8x2ur5xf1n5ivs84rwdlyto&st=jk0vcb2f&dl=1"
	        LOCAL_FILENAME = "matched_with_error_all_noid.sas7bdat"

	        def download_from_dropbox(url, out_path):
	            """Stream ``url`` to ``out_path`` unless the file already exists."""
	            if os.path.exists(out_path):
	                print("File already exists, skipping download.")
	                return
	            print("Downloading file from Dropbox...")
	            # Write under a temporary name first so an interrupted download is
	            # never mistaken for a complete file on the next start.
	            partial_path = out_path + ".part"
	            with requests.get(url, stream=True, timeout=60) as r:
	                r.raise_for_status()
	                with open(partial_path, "wb") as f:
	                    for chunk in r.iter_content(chunk_size=8192):
	                        f.write(chunk)
	            os.replace(partial_path, out_path)
	            print("Download complete.")

	        download_from_dropbox(DROPBOX_URL, LOCAL_FILENAME)

	        full_data, _ = pyreadstat.read_sas7bdat(LOCAL_FILENAME)
	        sample_size = int(len(full_data) * 0.01)
	        self.data = full_data.sample(n=sample_size, random_state=self.random_seed).reset_index(drop=True)
	        return True

	    @staticmethod
	    def _detection_probability(state, params):
	        """Return the test sensitivity for ``state`` (0 when nothing is detectable)."""
	        if state in (2, 8):  # Adenoma states
	            return params['sensitivity_state2']
	        if state in (3, 9):  # Advanced adenoma states
	            return params['sensitivity_state3']
	        if state == 4:  # Cancer
	            return params['sensitivity_state4']
	        if state == 5:  # Advanced cancer
	            return params['sensitivity_state5']
	        return 0

	    def simulate_screening(self, custom_params=None):
	        """Simulate the screening process for each individual.

	        Screening rounds run from age 50 up to (and including) age 74 at the
	        configured interval. A person who is detected, referred and successfully
	        treated has every state from that age onwards reset to healthy (1) and
	        leaves the programme.

	        Args:
	            custom_params: Optional dict of overrides merged on top of
	                ``self.params`` for this run only; ``self.params`` is not
	                modified.

	        Returns:
	            DataFrame with one row per individual and boolean outcome columns;
	            it is also stored in ``self.results``.

	        Raises:
	            RuntimeError: If no data has been loaded yet.
	            ValueError: If ``screening_interval`` is smaller than 1.
	        """
	        if self.data is None:
	            raise RuntimeError("No data loaded; call load_data() before simulate_screening().")

	        # Set random seed for reproducible simulation
	        random.seed(self.random_seed)
	        np.random.seed(self.numpy_seed)

	        # Merge the overrides onto the defaults (a new dict, defaults untouched).
	        params = {**self.params, **custom_params} if custom_params else self.params

	        screening_interval = int(params['screening_interval'])
	        if screening_interval < 1:
	            raise ValueError("screening_interval must be at least 1 year")

	        # Resolve the state columns once instead of once per row.
	        available = set(self.data.columns)
	        state_columns = [(age, f'vstate{age}') for age in range(50, 81)]

	        results = []

	        for _, row in self.data.iterrows():
	            # Natural history: health state by age; missing values count as healthy.
	            vstate = {}
	            for age, col in state_columns:
	                if col in available and pd.notna(row[col]):
	                    vstate[age] = int(row[col])
	                else:
	                    vstate[age] = 1

	            # Intervention history starts identical to the natural history.
	            istate = vstate.copy()

	            participated = False
	            detected = False
	            had_referral = False
	            intervention_success = False
	            intervention_age = None

	            # Share of never-participants is 0% for now. The draw is kept so the
	            # random stream (and therefore every result) stays unchanged.
	            never_participate = random.random() < 0

	            if not never_participate:
	                for screening_age in range(50, 75, screening_interval):
	                    current_state = istate[screening_age]

	                    # Skip if already dead or in terminal state
	                    if current_state in (6, 7):
	                        continue

	                    # Does the person attend this screening round?
	                    if random.random() > params['participation_rate']:
	                        continue
	                    participated = True

	                    # No random draw is made when nothing is detectable.
	                    detection_prob = self._detection_probability(current_state, params)
	                    if detection_prob <= 0 or random.random() > detection_prob:
	                        continue
	                    detected = True

	                    # Referral for further investigation (colonoscopy)
	                    if random.random() > params['referral_rate']:
	                        continue
	                    had_referral = True

	                    if random.random() > params['treatment_success']:
	                        continue
	                    intervention_success = True
	                    intervention_age = screening_age

	                    # Reset all future states to healthy (state 1) and stop screening.
	                    for treat_age in range(screening_age, 81):
	                        istate[treat_age] = 1
	                    break

	            # Outcome indicators with and without screening
	            cancer_noscreening = any(state in (4, 5, 6, 7) for state in vstate.values())
	            cancer_screening = any(state in (4, 5, 6, 7) for state in istate.values())
	            advca_noscreening = any(state in (5, 7) for state in vstate.values())
	            advca_screening = any(state in (5, 7) for state in istate.values())

	            # State that was treated; None when there was no successful
	            # intervention, which makes every membership test below False.
	            detected_state = vstate[intervention_age] if intervention_success else None

	            results.append({
	                'cancer_noscreening': cancer_noscreening,
	                'cancer_screening': cancer_screening,
	                'advca_noscreening': advca_noscreening,
	                'advca_screening': advca_screening,
	                'participated': participated,
	                'detected': detected,
	                'had_referral': had_referral,
	                'intervention_success': intervention_success,
	                'adenoma_detected': detected_state in (2, 3, 8, 9),
	                'advanced_adenoma_detected': detected_state in (3, 9),
	                'overdiag_adenoma': detected_state in (8, 9),
	                'overdiag_advanced_adenoma': detected_state == 9,
	            })

	        self.results = pd.DataFrame(results)
	        return self.results

	    def analyze_results(self):
	        """Analyze simulation results and calculate key metrics.

	        Returns:
	            Dict of rates and counts computed from ``self.results``, or None
	            when no simulation results are available. The ``*_reduction_pct``
	            and ``*_reduction_rate`` keys are only present when the matching
	            no-screening rate is greater than zero.
	        """
	        df = self.results
	        if df is None or len(df) == 0:
	            return None

	        # Overdiagnosis rate calculation - two types
	        adenoma_detected_count = df['adenoma_detected'].sum()
	        advanced_adenoma_detected_count = df['advanced_adenoma_detected'].sum()
	        overdiag_adenoma_count = df['overdiag_adenoma'].sum()
	        overdiag_advanced_adenoma_count = df['overdiag_advanced_adenoma'].sum()

	        overdiag_adenoma_rate = (
	            overdiag_adenoma_count / adenoma_detected_count if adenoma_detected_count > 0 else 0
	        )
	        overdiag_advanced_adenoma_rate = (
	            overdiag_advanced_adenoma_count / advanced_adenoma_detected_count
	            if advanced_adenoma_detected_count > 0 else 0
	        )

	        stats = {
	            'total_cases': len(df),
	            'cancer_rate_noscreening': df['cancer_noscreening'].mean(),
	            'cancer_rate_screening': df['cancer_screening'].mean(),
	            'cancer_prevented_rate': (df['cancer_noscreening'] & ~df['cancer_screening']).mean(),
	            'advca_rate_noscreening': df['advca_noscreening'].mean(),
	            'advca_rate_screening': df['advca_screening'].mean(),
	            'advca_prevented_rate': (df['advca_noscreening'] & ~df['advca_screening']).mean(),
	            'participation_rate': df['participated'].mean(),
	            'detection_rate': df['detected'].mean(),
	            'referral_rate': df['had_referral'].mean(),
	            'intervention_success_rate': df['intervention_success'].mean(),

	            # Two types of overdiagnosis rates
	            'overdiag_adenoma_rate': overdiag_adenoma_rate,
	            'overdiag_advanced_adenoma_rate': overdiag_advanced_adenoma_rate,

	            # Detailed counts
	            'adenoma_detected_count': adenoma_detected_count,
	            'advanced_adenoma_detected_count': advanced_adenoma_detected_count,
	            'overdiag_adenoma_count': overdiag_adenoma_count,
	            'overdiag_advanced_adenoma_count': overdiag_advanced_adenoma_count,
	        }

	        # Relative reduction percentages (undefined when the baseline rate is 0)
	        for prefix in ('cancer', 'advca'):
	            without = stats[f'{prefix}_rate_noscreening']
	            with_screening = stats[f'{prefix}_rate_screening']
	            if without > 0:
	                stats[f'{prefix}_reduction_pct'] = (without - with_screening) / without * 100
	                stats[f'{prefix}_reduction_rate'] = 1 - (with_screening / without)

	        return stats


	class CancerScreeningLLMInterface:
	    """Natural-language front end for a screening simulator.

	    A user question is turned into parameter overrides (by the LLM, with a
	    keyword/regex fallback), the simulator is run with them, and the resulting
	    statistics are formatted as text.
	    """

	    # Parameters that are probabilities; values above 1 are read as percentages.
	    _RATE_PARAMS = (
	        'participation_rate',
	        'referral_rate',
	        'treatment_success',
	        'sensitivity_state2',
	        'sensitivity_state3',
	        'sensitivity_state4',
	        'sensitivity_state5',
	    )

	    def __init__(self, simulator: "ScreeningSimulator"):
	        """Store the simulator and create the chat model client."""
	        self.simulator = simulator
	        self.llm = ChatOpenAI(temperature=0.3, model="gpt-3.5-turbo", max_tokens=1000)

	    def parse_user_query(self, user_input: str) -> dict:
	        """Parse user question and extract screening parameter adjustments.

	        Args:
	            user_input: The question as typed by the user.

	        Returns:
	            Dict of parameter overrides; empty when none were requested. If the
	            LLM call or the JSON decoding fails, the result of
	            ``_simple_parse`` is returned instead.
	        """
	        prompt = f"""
	Please analyze the user's question and extract screening parameter adjustments:

	User question: {user_input}

	Parameter descriptions:
	1. participation_rate: Currently 40%
	2. referral_rate: Currently 70%
	3. treatment_success: Currently 90%
	4. sensitivity_state2/3/4/5: Currently 44%/52%/61%/70%
	5. screening_interval: Currently 2 years (between screenings)

	Return only JSON format with parameters the user wants to adjust.
	Example: {{"participation_rate": 0.60, "referral_rate": 0.90, "screening_interval": 1}}
	If no parameter adjustments, return: {{}}
	"""

	        try:
	            response = self.llm.invoke([HumanMessage(content=prompt)])
	            json_match = re.search(r'\{[^}]*\}', response.content)
	            if json_match:
	                # The model's output is untrusted: keep only usable entries.
	                return self._sanitize_params(json.loads(json_match.group()))
	            return {}
	        except Exception:
	            # Any LLM/network/JSON failure falls back to keyword parsing.
	            return self._simple_parse(user_input)

	    def _sanitize_params(self, raw) -> dict:
	        """Keep only recognised, numeric parameters from decoded LLM output.

	        Rates above 1 are interpreted as percentages (the same rule
	        ``_simple_parse`` applies); negative values, non-numeric values,
	        unknown names and intervals below one year are dropped.
	        """
	        if not isinstance(raw, dict):
	            return {}
	        clean = {}
	        for name, value in raw.items():
	            if isinstance(value, bool) or not isinstance(value, (int, float)):
	                continue
	            if value < 0:
	                continue
	            if name == 'screening_interval':
	                if int(value) >= 1:
	                    clean[name] = int(value)
	            elif name in self._RATE_PARAMS:
	                clean[name] = value / 100 if value > 1 else value
	        return clean

	    def _simple_parse(self, user_input: str) -> dict:
	        """Simple fallback parsing using keywords and the first number found.

	        Rate parameters need a number in the question; the screening interval
	        can also be given by words alone ("annual", "biennial", ...).

	        Args:
	            user_input: The question as typed by the user.

	        Returns:
	            Dict of parameter overrides; empty when nothing was recognised.
	        """
	        text = user_input.lower()
	        params = {}
	        numbers = re.findall(r'(\d+(?:\.\d+)?)', text)

	        def mentions(*keywords):
	            return any(keyword in text for keyword in keywords)

	        if numbers:
	            value = float(numbers[0])
	            # Values above 1 are percentages ("60" -> 0.60).
	            rate = value / 100 if value > 1 else value

	            if mentions('participation', 'participate'):
	                params['participation_rate'] = rate
	            if mentions('referral', 'colonoscopy', 'enhance'):
	                params['referral_rate'] = rate
	            if mentions('treatment', 'success'):
	                params['treatment_success'] = rate
	            if mentions('sensitivity', 'detection'):
	                # Apply to all sensitivity parameters
	                for state in (2, 3, 4, 5):
	                    params[f'sensitivity_state{state}'] = rate

	        # Handle common expressions first; they need no number in the text.
	        if mentions('annual', 'yearly', 'every year'):
	            params['screening_interval'] = 1
	        elif mentions('biennial', 'every 2 years', 'two years'):
	            params['screening_interval'] = 2
	        elif numbers and mentions('interval', 'frequency'):
	            years = int(float(numbers[0]))
	            # An interval below one year cannot be simulated; ignore it.
	            if years >= 1:
	                params['screening_interval'] = years

	        return params

	    def generate_response(self, user_input: str, scenario_params: dict, stats: dict) -> str:
	        """Generate the answer text from the analysis results.

	        Args:
	            user_input: The original question (currently not used in the text).
	            scenario_params: Parameter overrides used for ``stats``.
	            stats: Metrics of the scenario run.

	        Returns:
	            A baseline summary when there are no overrides, otherwise a
	            baseline-versus-adjusted comparison. If building the comparison
	            fails, the output of ``_simple_response`` is returned.
	        """
	        # For basic effectiveness questions without parameter changes
	        if not scenario_params:
	            return f"""Based on the analysis results, colorectal cancer screening reduces cancer incidence from {stats['cancer_rate_noscreening']:.2%} to {stats['cancer_rate_screening']:.2%} ({stats.get('cancer_reduction_pct', 0):.1f}% reduction) and advanced cancer from {stats['advca_rate_noscreening']:.2%} to {stats['advca_rate_screening']:.2%} ({stats.get('advca_reduction_pct', 0):.1f}% reduction). The cancer prevention effect is {stats['cancer_prevented_rate']:.2%} and the advanced cancer prevention rate is {stats['advca_prevented_rate']:.2%}.

	If you have specific parameter adjustments in mind, please provide them for a detailed comparison of the screening effectiveness with the adjusted parameters."""

	        # For questions with parameter adjustments, provide comparison.
	        # The baseline run overwrites simulator.results, so remember the
	        # scenario results and put them back afterwards.
	        scenario_results = self.simulator.results
	        try:
	            # Run baseline simulation for comparison
	            self.simulator.simulate_screening()
	            baseline_stats = self.simulator.analyze_results()

	            response = f"""With the parameter adjustment of {scenario_params}:

	Cancer Incidence:
	- Baseline: {baseline_stats['cancer_rate_noscreening']:.2%} → {baseline_stats['cancer_rate_screening']:.2%} ({baseline_stats.get('cancer_reduction_pct', 0):.1f}% reduction)
	- Adjusted: {stats['cancer_rate_noscreening']:.2%} → {stats['cancer_rate_screening']:.2%} ({stats.get('cancer_reduction_pct', 0):.1f}% reduction)
	- Change in cancer rate: {(stats['cancer_rate_screening'] - baseline_stats['cancer_rate_screening']) * 100:+.2f} percentage points

	Advanced Cancer:
	- Baseline: {baseline_stats['advca_rate_noscreening']:.2%} → {baseline_stats['advca_rate_screening']:.2%} ({baseline_stats.get('advca_reduction_pct', 0):.1f}% reduction)
	- Adjusted: {stats['advca_rate_noscreening']:.2%} → {stats['advca_rate_screening']:.2%} ({stats.get('advca_reduction_pct', 0):.1f}% reduction)
	- Change in advanced cancer rate: {(stats['advca_rate_screening'] - baseline_stats['advca_rate_screening']) * 100:+.2f} percentage points

	Key Changes:
	- Cancer prevention effect: {baseline_stats['cancer_prevented_rate']:.2%} → {stats['cancer_prevented_rate']:.2%} ({(stats['cancer_prevented_rate'] - baseline_stats['cancer_prevented_rate']) * 100:+.2f} pp)

	"""
	            return response

	        except Exception:
	            return self._simple_response(scenario_params, stats)
	        finally:
	            self.simulator.results = scenario_results

	    def _simple_response(self, scenario_params: dict, stats: dict) -> str:
	        """Simple fallback response that needs only the scenario statistics.

	        Args:
	            scenario_params: Parameter overrides to list at the end (may be empty).
	            stats: Metrics dictionary with the rate keys read below.

	        Returns:
	            The formatted report text.
	        """
	        response = f"""
	📊 Analysis Results (Based on {stats['total_cases']:,} real cases):

	🎯 Cancer Incidence:
	- Natural history: {stats['cancer_rate_noscreening']:.2%}
	- With screening: {stats['cancer_rate_screening']:.2%}
	- Prevention effect: {stats['cancer_prevented_rate']:.2%}

	🎯 Advanced Cancer Incidence:
	- Natural history: {stats['advca_rate_noscreening']:.2%}
	- With screening: {stats['advca_rate_screening']:.2%}
	- Prevention effect: {stats['advca_prevented_rate']:.2%}

	🔬 Screening Process:
	- Participation rate: {stats['participation_rate']:.2%}
	- Successful intervention rate: {stats['intervention_success_rate']:.2%}
	"""
	        if not scenario_params:
	            return response

	        pieces = [response, "\n🔧 Parameter Adjustments:\n"]
	        for param, value in scenario_params.items():
	            if param == 'screening_interval':
	                # The interval is a number of years, not a probability.
	                pieces.append(f" - {param}: {value} years\n")
	                continue
	            try:
	                shown = f"{value:.1%}"
	            except (TypeError, ValueError):
	                shown = str(value)
	            pieces.append(f" - {param}: {shown}\n")
	        return "".join(pieces)

	    def answer_question(self, user_input: str) -> str:
	        """Main method to answer user questions.

	        Parses the question, runs the simulator with the extracted overrides
	        and returns the formatted answer.
	        """
	        scenario_params = self.parse_user_query(user_input)

	        # Debug: Show what parameters were parsed
	        if scenario_params:
	            print(f"🔧 Parsed parameters: {scenario_params}")

	        self.simulator.simulate_screening(scenario_params)
	        stats = self.simulator.analyze_results()

	        if stats is None:
	            return "Analysis failed, please try again."

	        return self.generate_response(user_input, scenario_params, stats)







	import gradio as gr

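	# --- ScreeningSimulator and CancerScreeningLLMInterface are defined earlier in this file ---
	# (if they are moved to a separate module, import them here, e.g.
	#  from your_module import ScreeningSimulator, CancerScreeningLLMInterface)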

	# Create simulator and interface only once, to persist between calls.
	# This runs at import time, so the data is loaded a single time and the same
	# sampled DataFrame is reused for every question.
	simulator = ScreeningSimulator()
	# Downloads the dataset if it is not on disk yet, then keeps a 1% sample.
	simulator.load_data()
	llm_interface = CancerScreeningLLMInterface(simulator)

	def answer_question_gradio(question):
	    """Gradio callback: answer one question and never raise to the UI.

	    Args:
	        question: Text from the input box; may be None or blank.

	    Returns:
	        The generated answer, a prompt to enter a question when the input is
	        empty, or an error message if answering failed.
	    """
	    # Treat None the same as an empty/whitespace-only string.
	    if not question or not question.strip():
	        return "Please enter a question."
	    try:
	        return llm_interface.answer_question(question)
	    except Exception as e:
	        # Top-level UI boundary: show the error instead of crashing the app.
	        return f"❌ Error: {e}"

	# Create the Gradio Interface: one multi-line text box in, one text box out,
	# with answer_question_gradio as the handler.
	iface = gr.Interface(
	fn=answer_question_gradio,
	inputs=gr.Textbox(lines=5, label="🙋‍♀️ Please enter your question"),
	outputs=gr.Textbox(lines=5, label="🤖 AI Analyst"),
	title="🔬 Colorectal Cancer Screening AI Analysis System",
	description="Ask a question about the effectiveness of CRC screening in Taiwan—for example, regarding screening intervals, attendance rates, referral rates, or treatment success rates."
	)

	# Start the web UI only when the file is executed as a script.
	# NOTE(review): share=True presumably requests a public Gradio share link — confirm this is intended.
	if __name__ == "__main__":
	iface.launch(share=True)