Spaces:

mwmathis
/

ClinicalTrialMatcher_HF

Running

App Files Files Community

ClinicalTrialMatcher_HF / app.py

mwmathis

default country changed

68f7bb3 verified about 2 months ago

raw

history blame contribute delete

25.3 kB

	"""
	Gradio app for Clinical Trial Matcher
	For Hugging Face Spaces deployment
	"""
	import html
	import os
	import re
	from typing import Tuple, Optional

	import gradio as gr
	import requests

	# ClinicalTrials.gov API base URL (the v2 "studies" collection endpoint).
	CLINICALTRIALS_API_BASE = "https://clinicaltrials.gov/api/v2/studies"

	# The LLM ranking service is optional. These defaults describe the
	# "ranking disabled" state and are only overwritten when the import
	# below succeeds.
	LLM_AVAILABLE = False
	get_llm_service = None

	# Check environment variables. In this section of the file the two values
	# are only used for the startup diagnostic printed below; only whether a
	# token is set is printed, never the token itself.
	USE_HF_API = os.environ.get('USE_HF_API', 'false').lower() == 'true'
	HF_TOKEN = os.environ.get('HUGGINGFACE_API_TOKEN', '')
	print(f"Environment check - USE_HF_API: {USE_HF_API}, HF_TOKEN set: {bool(HF_TOKEN)}")

	try:
	    from llm_service import get_llm_service
	except ImportError as e:
	    # Missing module or dependency: degrade gracefully, search still works.
	    print(f"LLM service not available. Ranking will be disabled. Error: {str(e)}")
	except Exception as e:
	    # Any other failure raised while importing llm_service: report it with a
	    # full traceback, but keep the app usable without ranking.
	    print(f"Error importing LLM service: {str(e)}")
	    import traceback
	    traceback.print_exc()
	else:
	    LLM_AVAILABLE = True
	    print("LLM service imported successfully.")


	# Page size of the search request and per-request timeouts in seconds.
	_SEARCH_PAGE_SIZE = 20
	_SEARCH_TIMEOUT = 30
	_DETAIL_TIMEOUT = 60

	# Patterns that capture the text following an "Inclusion Criteria" /
	# "Exclusion Criteria" heading (optionally prefixed by "Key"). The inclusion
	# capture stops at the exclusion heading or at the end of the text.
	_INCLUSION_PATTERNS = tuple(
	    re.compile(pattern, re.IGNORECASE | re.DOTALL)
	    for pattern in (
	        r'(?:Key\s+)?Inclusion\s+Criteria[^:\n]*:?\s*(.*?)(?=(?:Key\s+)?Exclusion\s+Criteria[^:\n]*:?|$)',
	        r'Inclusion\s+Criteria[^:\n]*:?\s*(.*?)(?=Exclusion\s+Criteria[^:\n]*:?|$)',
	    )
	)
	_EXCLUSION_PATTERNS = tuple(
	    re.compile(pattern, re.IGNORECASE | re.DOTALL)
	    for pattern in (
	        r'(?:Key\s+)?Exclusion\s+Criteria[^:\n]*:?\s*(.*?)$',
	        r'Exclusion\s+Criteria[^:\n]*:?\s*(.*?)$',
	    )
	)
	# Leading list marker ("*", "-" or "•") plus the whitespace after it.
	_BULLET_PREFIX = re.compile(r'^[\*\-\•]\s*')

	# Badge colour per overall status; unknown statuses use the default grey.
	_STATUS_BADGE_COLORS = {
	    'RECRUITING': '#28a745',
	    'NOT_YET_RECRUITING': '#ffc107',
	    'ACTIVE_NOT_RECRUITING': '#17a2b8',
	    'COMPLETED': '#6c757d',
	    'SUSPENDED': '#dc3545',
	    'TERMINATED': '#dc3545',
	    'WITHDRAWN': '#6c757d',
	}
	_DEFAULT_BADGE_COLOR = '#6c757d'


	def _esc(value) -> str:
	    """Return *value* as text that is safe to place in HTML content or attributes."""
	    return html.escape('' if value is None else str(value), quote=True)


	def _criteria_items(patterns, eligibility_text: str) -> list:
	    """Return the list items captured by the first pattern in *patterns* that matches."""
	    for pattern in patterns:
	        match = pattern.search(eligibility_text)
	        if match:
	            break
	    else:
	        return []

	    items = []
	    for line in match.group(1).strip().split('\n'):
	        item = _BULLET_PREFIX.sub('', line.strip()).strip()
	        # Very short fragments (3 characters or fewer) are dropped as noise.
	        if len(item) > 3:
	            items.append(item)
	    return items


	def _parse_eligibility_criteria(eligibility_text) -> Tuple[list, list]:
	    """Split free-text eligibility criteria into (inclusion, exclusion) item lists."""
	    if not isinstance(eligibility_text, str) or not eligibility_text:
	        return [], []
	    return (
	        _criteria_items(_INCLUSION_PATTERNS, eligibility_text),
	        _criteria_items(_EXCLUSION_PATTERNS, eligibility_text),
	    )


	def _extract_locations(locations_module) -> list:
	    """Return facility/city/country dicts from a contactsLocationsModule mapping."""
	    if not isinstance(locations_module, dict):
	        return []
	    raw_locations = locations_module.get('locations')
	    if not isinstance(raw_locations, list):
	        return []
	    return [
	        {
	            'facility': loc.get('facility', ''),
	            'city': loc.get('city', ''),
	            'country': loc.get('country', ''),
	        }
	        for loc in raw_locations
	        if isinstance(loc, dict)
	    ]


	def _fetch_study_detail(nct_id: str) -> Optional[dict]:
	    """Fetch the single-study record for *nct_id*; return None when that fails."""
	    try:
	        response = requests.get(
	            f"{CLINICALTRIALS_API_BASE}/{nct_id}",
	            params={'format': 'json'},
	            timeout=_DETAIL_TIMEOUT,
	        )
	        response.raise_for_status()
	        detail = response.json()
	    except (requests.RequestException, ValueError) as e:
	        # Best effort: the caller falls back to the data from the search response.
	        print(f"Could not fetch details for {nct_id}: {e}")
	        return None
	    return detail if isinstance(detail, dict) else None


	def _sort_locations_by_country(locations: list, country: str) -> None:
	    """Sort *locations* in place: exact country match, then partial match, then the rest."""
	    wanted = country.lower().strip()

	    def sort_key(loc):
	        loc_country = (loc.get('country') or '').lower().strip()
	        if loc_country == wanted:
	            return 0
	        # An empty country must not count as a partial match ('' is "in" every string).
	        if loc_country and (wanted in loc_country or loc_country in wanted):
	            return 1
	        return 2

	    locations.sort(key=sort_key)


	def _build_study_info(study: dict, country: str) -> dict:
	    """Convert one study from the search response into the flat dict used for ranking and display."""
	    protocol_section = study.get('protocolSection') or {}
	    identification = protocol_section.get('identificationModule') or {}
	    status_module = protocol_section.get('statusModule') or {}
	    description = protocol_section.get('descriptionModule') or {}
	    conditions = protocol_section.get('conditionsModule') or {}
	    organization = identification.get('organization') or {}
	    nct_id = identification.get('nctId', '')

	    # Fetch the single-study record for locations and eligibility criteria;
	    # when that is unavailable, use what the search response carries.
	    detail_data = _fetch_study_detail(nct_id) if nct_id else None
	    if detail_data:
	        detail_protocol = detail_data.get('protocolSection') or {}
	        locations = _extract_locations(detail_protocol.get('contactsLocationsModule'))
	        eligibility_module = detail_protocol.get('eligibilityModule') or {}
	    else:
	        locations = []
	        eligibility_module = protocol_section.get('eligibilityModule') or {}

	    inclusion_criteria, exclusion_criteria = _parse_eligibility_criteria(
	        eligibility_module.get('eligibilityCriteria', '')
	    )

	    # If detail fetch failed or returned no locations, use search results as fallback
	    if not locations:
	        locations = _extract_locations(protocol_section.get('contactsLocationsModule'))

	    # Sort locations: searched country first, then others
	    if country and locations:
	        _sort_locations_by_country(locations, country)

	    return {
	        'nctId': nct_id,
	        'title': identification.get('officialTitle') or identification.get('briefTitle', ''),
	        'sponsor': organization.get('fullName', ''),
	        'status': status_module.get('overallStatus', ''),
	        'conditions': conditions.get('conditions', []),
	        'briefSummary': description.get('briefSummary', ''),
	        'locations': locations,
	        'inclusionCriteria': inclusion_criteria,
	        'exclusionCriteria': exclusion_criteria,
	        'lastUpdateDate': status_module.get('lastUpdateSubmitDate', ''),
	    }


	def _rank_studies(studies: list, ranking_terms: str) -> Tuple[list, Optional[str]]:
	    """Rank *studies* with the LLM service; return (studies, model name or None if not ranked)."""
	    if not LLM_AVAILABLE:
	        print("Warning: LLM service not available. Ranking disabled.")
	        print(f" - LLM_AVAILABLE: {LLM_AVAILABLE}")
	        print(f" - get_llm_service: {get_llm_service}")
	        return studies, None
	    if not get_llm_service:
	        print("Warning: get_llm_service is None. Ranking disabled.")
	        return studies, None

	    try:
	        llm_service = get_llm_service()
	        if not llm_service:
	            print("Warning: LLM service returned None. Ranking disabled.")
	            print(f" - Check environment variables:")
	            print(f" USE_HF_API: {os.environ.get('USE_HF_API', 'NOT SET')}")
	            print(f" HUGGINGFACE_API_TOKEN: {'SET' if os.environ.get('HUGGINGFACE_API_TOKEN') else 'NOT SET'}")
	            return studies, None

	        model_name = llm_service.model_name
	        print(f"Ranking studies with {model_name} using terms: {ranking_terms}")
	        ranked = llm_service.rank_studies(studies, ranking_terms)
	        print(f"Ranking completed. {len(ranked)} studies ranked.")
	        # The model name is reported only once ranking has actually succeeded,
	        # so the UI never claims a ranking that did not happen.
	        return ranked, model_name
	    except Exception as e:
	        print(f"Error during LLM ranking: {str(e)}")
	        import traceback
	        traceback.print_exc()
	        return studies, None


	def _render_ranking_banner(ranking_terms: str, llm_model_name: Optional[str]) -> str:
	    """Return the banner describing whether (and by which model) ranking was applied."""
	    if llm_model_name:
	        terms = _esc(ranking_terms)
	        model = _esc(llm_model_name)
	        return (
	            f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 8px; margin: 1rem 0;"><strong>🤖 Results ranked by {model}</strong><br>Ranking terms: "{terms}"</div>'
	            f'<p style="margin: 1rem 0; padding: 0.75rem; background: #f0f8ff; border-left: 3px solid #667eea; border-radius: 4px;"><strong>Ranking applied:</strong> Studies have been reordered by AI relevance to "{terms}". The model analyzed each study\'s title, summary, conditions, and inclusion criteria to determine how closely they match your ranking terms. Higher-ranked studies appear first.</p>'
	        )

	    # Ranking was requested but not applied: point at the likely configuration gaps.
	    use_hf_api = os.environ.get('USE_HF_API', 'false').lower() == 'true'
	    hf_token = os.environ.get('HUGGINGFACE_API_TOKEN', '')
	    error_msg = "LLM service is not available."
	    if not use_hf_api:
	        error_msg += " Set USE_HF_API=true in your Space secrets."
	    if not hf_token:
	        error_msg += " Set HUGGINGFACE_API_TOKEN in your Space secrets."
	    return f'<div style="background: #fff3cd; color: #856404; padding: 1rem; border-radius: 8px; margin: 1rem 0; border: 1px solid #ffc107;"><strong>⚠️ Ranking not applied:</strong> {error_msg}<br><small>Check your Space Settings → Secrets to add the required environment variables.</small></div>'


	def _render_criteria(label: str, criteria: list) -> str:
	    """Return a collapsible HTML list titled *label* for the given criteria."""
	    items = ''.join(
	        f"<li style='margin-bottom: 0.5rem;'>{_esc(criterion)}</li>" for criterion in criteria
	    )
	    return (
	        "<details style='margin-top: 1rem;'>"
	        f"<summary style='cursor: pointer; color: #667eea; font-weight: 600;'>{label} ▼</summary>"
	        f"<ul style='margin-top: 0.5rem; padding-left: 1.5rem;'>{items}</ul></details>"
	    )


	def _render_study_card(study: dict) -> str:
	    """Render one study dict as an HTML card; every interpolated value is escaped."""
	    status = study.get('status', '')
	    badge_color = _STATUS_BADGE_COLORS.get(status, _DEFAULT_BADGE_COLOR)
	    title = _esc(study.get('title', ''))
	    status_text = _esc(status)
	    nct_id = _esc(study.get('nctId', ''))
	    last_update = _esc(study.get('lastUpdateDate', ''))

	    # Add relevance score indicator if ranking was applied
	    relevance_indicator = ""
	    if 'relevance_score' in study:
	        try:
	            score = float(study['relevance_score'] or 0)
	            score_percent = int(score * 100)
	        except (TypeError, ValueError, OverflowError):
	            # A malformed score should not break rendering of the whole result list.
	            score, score_percent = 0.0, 0
	        # Color based on score: green for high, yellow for medium, gray for low
	        if score >= 0.7:
	            score_color = '#28a745'
	        elif score >= 0.4:
	            score_color = '#ffc107'
	        else:
	            score_color = '#6c757d'
	        relevance_indicator = f'<span style="background: {score_color}; color: white; padding: 0.25rem 0.75rem; border-radius: 12px; font-size: 0.85rem; font-weight: 600; margin-left: 0.5rem;">Relevance: {score_percent}%</span>'

	    parts = [
	        '<div style="border: 1px solid #e0e0e0; border-radius: 8px; padding: 1.5rem; margin: 1rem 0; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">',
	        '<div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 1rem;">',
	        f'<h3 style="margin: 0; color: #333; flex: 1;">{title}</h3>',
	        '<div style="display: flex; align-items: center; gap: 0.5rem;">',
	        f'<span style="background: {badge_color}; color: white; padding: 0.25rem 0.75rem; border-radius: 12px; font-size: 0.85rem; font-weight: 600;">{status_text}</span>',
	        relevance_indicator,
	        '</div>',
	        '</div>',
	        f'<p><strong>NCT ID:</strong> {nct_id}</p>',
	    ]

	    if study.get('sponsor'):
	        parts.append(f"<p><strong>Sponsor:</strong> {_esc(study['sponsor'])}</p>")

	    if study.get('conditions'):
	        condition_text = ', '.join(_esc(condition) for condition in study['conditions'])
	        parts.append(f"<p><strong>Conditions:</strong> {condition_text}</p>")

	    brief_summary = study.get('briefSummary')
	    if brief_summary:
	        # Truncate before escaping so an HTML entity is never cut in half.
	        summary = brief_summary[:600] + '...' if len(brief_summary) > 600 else brief_summary
	        parts.append(f'<p style="color: #666; line-height: 1.6;"><strong>Summary:</strong> {_esc(summary)}</p>')

	    if study.get('inclusionCriteria'):
	        parts.append(_render_criteria("Inclusion Criteria", study['inclusionCriteria']))

	    if study.get('exclusionCriteria'):
	        parts.append(_render_criteria("Exclusion Criteria", study['exclusionCriteria']))

	    if study.get('ranking_reasoning'):
	        parts.append(f"<p style='margin-top: 1rem; padding: 0.75rem; background: #e8f4f8; border-left: 3px solid #667eea; border-radius: 4px; font-size: 0.9rem;'><strong>Ranking Reasoning:</strong> {_esc(study['ranking_reasoning'])}</p>")

	    parts.append(
	        '<div style="margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #e0e0e0; display: flex; justify-content: space-between; align-items: center;">'
	        f'<span style="color: #888; font-size: 0.9rem;">Last updated: {last_update}</span>'
	        f'<a href="https://clinicaltrials.gov/study/{nct_id}" target="_blank" style="color: #667eea; text-decoration: none; font-weight: 600;">View on ClinicalTrials.gov →</a>'
	        '</div>'
	        '</div>'
	    )
	    return ''.join(parts)


	def search_trials(query: str, country: str, status: str, ranking_terms: str = "") -> Tuple[str, int, Optional[str]]:
	    """
	    Search clinical trials and return formatted results

	    Sends one search request to the ClinicalTrials.gov API, fetches the
	    single-study record of every hit for locations and eligibility criteria,
	    optionally reorders the hits through the LLM service, and renders them
	    as HTML cards.

	    Args:
	        query: Search keywords. Empty or whitespace-only input returns a
	            prompt message without contacting the API.
	        country: Optional country name used as an AREA[LocationCountry]
	            filter; matching locations are listed first.
	        status: Optional overall status used as an AREA[OverallStatus] filter.
	        ranking_terms: Optional terms passed to the LLM service for ranking.

	    Returns:
	        tuple: (results_html, total_count, llm_model_name). llm_model_name is
	        None unless ranking was requested and completed successfully. Any
	        exception is reported as an "Error: ..." message with count 0.
	    """
	    query = (query or "").strip()
	    if not query:
	        return "Please enter search keywords.", 0, None

	    try:
	        # Build query filter
	        query_parts = [query]
	        if country:
	            query_parts.append(f'AREA[LocationCountry]{country}')
	        if status:
	            query_parts.append(f'AREA[OverallStatus]{status}')

	        # Build API request parameters
	        params = {
	            'format': 'json',
	            'pageSize': _SEARCH_PAGE_SIZE,
	            'query.term': ' AND '.join(query_parts),
	        }

	        # Make request to ClinicalTrials.gov API
	        response = requests.get(CLINICALTRIALS_API_BASE, params=params, timeout=_SEARCH_TIMEOUT)
	        response.raise_for_status()
	        data = response.json()

	        # Extract and format relevant information
	        studies = [_build_study_info(study, country) for study in (data.get('studies') or [])]
	        if not studies:
	            return "No studies found. Try different search criteria.", 0, None

	        # Apply LLM-based ranking if ranking terms provided
	        ranking_requested = bool(ranking_terms and ranking_terms.strip())
	        llm_model_name = None
	        if ranking_requested:
	            studies, llm_model_name = _rank_studies(studies, ranking_terms)
	            if not studies:
	                return "No studies found. Try different search criteria.", 0, None

	        # Format results as HTML
	        parts = [f"<h2>Found {len(studies)} studies</h2>"]
	        if ranking_requested:
	            parts.append(_render_ranking_banner(ranking_terms, llm_model_name))
	        parts.extend(_render_study_card(study) for study in studies)

	        return ''.join(parts), len(studies), llm_model_name

	    except Exception as e:
	        # Top-level boundary for the UI: report the failure instead of raising.
	        return f"Error: {_esc(e)}", 0, None


	# Create Gradio interface
	with gr.Blocks(title="Clinical Trial Matcher", theme=gr.themes.Soft()) as demo:
	    # Inject PWA meta tags for iOS/Android installation
	    # NOTE(review): the script lives in a gr.HTML component created with
	    # visible=False; confirm that it actually executes in the deployed Gradio
	    # version and that a manifest.json is really served at that path.
	    gr.HTML("""
	    <script>
	    (function() {
	        // Add PWA meta tags to document head
	        const metaTags = [
	            { name: 'apple-mobile-web-app-capable', content: 'yes' },
	            { name: 'apple-mobile-web-app-status-bar-style', content: 'default' },
	            { name: 'apple-mobile-web-app-title', content: 'Trial Matcher' },
	            { name: 'mobile-web-app-capable', content: 'yes' },
	            { name: 'theme-color', content: '#4a90e2' }
	        ];

	        metaTags.forEach(tag => {
	            let meta = document.querySelector(`meta[name="${tag.name}"]`);
	            if (!meta) {
	                meta = document.createElement('meta');
	                meta.setAttribute('name', tag.name);
	                document.head.appendChild(meta);
	            }
	            meta.setAttribute('content', tag.content);
	        });

	        // Add manifest link
	        let manifestLink = document.querySelector('link[rel="manifest"]');
	        if (!manifestLink) {
	            manifestLink = document.createElement('link');
	            manifestLink.setAttribute('rel', 'manifest');
	            document.head.appendChild(manifestLink);
	        }
	        manifestLink.setAttribute('href', 'manifest.json');
	    })();
	    </script>
	    """, visible=False)
	    gr.Markdown("""
	    # 🔬 Clinical Trial Matcher

	    Search and filter clinical trials from [ClinicalTrials.gov](https://clinicaltrials.gov/) with AI-powered fine-tuning of the results based on your query.

	    """)

	    with gr.Row():
	        # Left column: search form. The ranking controls start hidden and are
	        # revealed by the first search (see perform_search).
	        with gr.Column(scale=1):
	            query_input = gr.Textbox(
	                label="Search Keywords",
	                placeholder="e.g., pancreatic cancer, PDAC, KRAS, etc.",
	                value=""
	            )
	            country_input = gr.Dropdown(
	                label="Country (Optional)",
	                choices=[
	                    "", "Germany", "United States", "United Kingdom", "Canada", "France",
	                    "Italy", "Spain", "Netherlands", "Belgium", "Switzerland", "Austria",
	                    "Sweden", "Norway", "Denmark", "Finland", "Poland", "Czech Republic",
	                    "Australia", "New Zealand", "Japan", "China", "India", "South Korea",
	                    "Brazil", "Mexico", "Argentina", "Chile", "South Africa", "Israel",
	                    "Turkey", "Russia", "Greece", "Portugal", "Ireland"
	                ],
	                value="",
	                filterable=True,
	                interactive=True
	            )
	            status_input = gr.Dropdown(
	                label="Status (Optional)",
	                choices=["", "RECRUITING", "NOT_YET_RECRUITING", "ACTIVE_NOT_RECRUITING",
	                         "COMPLETED", "SUSPENDED", "TERMINATED", "WITHDRAWN"],
	                value="",
	                interactive=True
	            )
	            ranking_input = gr.Textbox(
	                label="✨ Use AI to sort the results based on my query:",
	                placeholder="e.g., KRAS mutation, immunotherapy",
	                value="",
	                visible=False
	            )
	            ranking_btn = gr.Button("Rank Results", variant="secondary", visible=False)
	            search_btn = gr.Button("Search Clinical Trials", variant="primary", size="lg")

	        # Right column: rendered results plus a hidden result counter.
	        with gr.Column(scale=2):
	            results_output = gr.HTML(label="Results")
	            count_output = gr.Textbox(label="Total Studies Found", visible=False)

	    # Search function
	    def perform_search(query, country, status, ranking_terms=""):
	        """Run a search and reveal the ranking textbox and button.

	        Returns the results HTML, the study count, and two visibility updates
	        in the order of the `outputs` list wired to search_btn below.
	        """
	        results_html, count, _model = search_trials(query, country, status, ranking_terms)
	        # Show ranking input and button after search
	        return results_html, count, gr.update(visible=True), gr.update(visible=True)

	    # Ranking function
	    def perform_ranking(query, country, status, ranking_terms):
	        """Re-run the search with ranking terms; requires non-blank terms.

	        Returns the results HTML and the study count. With blank terms a
	        prompt message and a count of 0 are returned instead.
	        """
	        if not ranking_terms or not ranking_terms.strip():
	            return "Please enter ranking terms to sort the results.", 0
	        print(f"Ranking requested with terms: {ranking_terms}")
	        results_html, count, _model = search_trials(query, country, status, ranking_terms)
	        return results_html, count

	    search_btn.click(
	        fn=perform_search,
	        inputs=[query_input, country_input, status_input, ranking_input],
	        outputs=[results_output, count_output, ranking_input, ranking_btn]
	    )

	    ranking_btn.click(
	        fn=perform_ranking,
	        inputs=[query_input, country_input, status_input, ranking_input],
	        outputs=[results_output, count_output]
	    )

	    gr.Markdown("""
	    ---
	    App. developed by [Mackenzie Weygandt Mathis](https://en.wikipedia.org/wiki/Mackenzie_Weygandt_Mathis). Data sourced from [ClinicalTrials.gov](https://clinicaltrials.gov)

	    This app uses AI-powered ranking with Hugging Face models (default: DeepSeek-V3.2) to intelligently rank search results by relevance.
	    """)

	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860.
	    demo.launch(server_name="0.0.0.0", server_port=7860)