# MedRegRAG / app.py
# Author: Harrisun — commit 0d1b42d (verified)
import gradio as gr
import os
import json
import pickle
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import fitz # PyMuPDF for PDF processing
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import sqlite3
import hashlib
from typing import List, Dict, Any, Tuple
import logging
import tempfile
import shutil
from urllib.parse import urlparse, urljoin
import re
# Configure root logging at INFO level and create the module-level logger
# used by MedicalRAGSystem and the Gradio handler functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MedicalRAGSystem:
    """SQLite-backed RAG knowledge base for medical-device regulatory content.

    Three source types (uploaded documents, scraped websites, pasted
    standards) are stored verbatim, split into overlapping word chunks,
    and embedded with a sentence-transformer model.  Queries are answered
    by brute-force cosine similarity over every stored chunk embedding.
    """

    def __init__(self):
        # Set by load_embedding_model(); stays None if loading fails, in
        # which case embedding generation and search degrade gracefully.
        self.embedding_model = None
        self.db_path = "medical_rag.db"
        self.embeddings_cache = {}  # currently unused; reserved for in-memory caching
        self.init_database()
        self.load_embedding_model()

    def load_embedding_model(self):
        """Load a free sentence-transformer model into self.embedding_model."""
        try:
            # Lightweight, free model suitable for regulatory text.
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            logger.info("Embedding model loaded successfully")
        except Exception as e:
            # self.embedding_model remains None; callers check for that.
            logger.error(f"Error loading embedding model: {e}")
            return None

    def init_database(self):
        """Create the SQLite schema (documents/websites/standards/embeddings) if missing."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        # content_hash is UNIQUE so re-inserting identical content raises
        # sqlite3.IntegrityError, which the add_* methods translate into an
        # "already exists" message.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT NOT NULL,
                content TEXT NOT NULL,
                content_hash TEXT UNIQUE,
                category TEXT NOT NULL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                metadata TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS websites (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                content TEXT NOT NULL,
                content_hash TEXT UNIQUE,
                title TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                metadata TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS standards (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                standard_name TEXT NOT NULL,
                content TEXT NOT NULL,
                content_hash TEXT UNIQUE,
                version TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                metadata TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS embeddings (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_type TEXT NOT NULL,
                source_id INTEGER NOT NULL,
                chunk_index INTEGER NOT NULL,
                embedding BLOB NOT NULL,
                text_chunk TEXT NOT NULL
            )
        ''')
        conn.commit()
        conn.close()
        logger.info("Database initialized successfully")

    def get_content_hash(self, content: str) -> str:
        """Return the MD5 hex digest of *content*.

        Used only as a deduplication key (UNIQUE column), not for security.
        """
        return hashlib.md5(content.encode()).hexdigest()

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split *text* into overlapping word chunks for better retrieval.

        chunk_size and overlap are word counts; consecutive chunks share
        *overlap* words so sentences straddling a boundary remain searchable.
        """
        words = text.split()
        chunks = []
        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i + chunk_size])
            if chunk.strip():
                chunks.append(chunk)
        return chunks

    def process_pdf_document(self, file_path: str) -> Tuple[str, Dict]:
        """Extract all page text from a PDF via PyMuPDF.

        Returns (text, metadata); ("", {}) on any failure.
        """
        try:
            doc = fitz.open(file_path)
            text_content = ""
            metadata = {"pages": doc.page_count, "format": "PDF"}
            for page_num in range(doc.page_count):
                page = doc[page_num]
                text_content += page.get_text()
            doc.close()
            return text_content, metadata
        except Exception as e:
            logger.error(f"Error processing PDF: {e}")
            return "", {}

    def process_text_document(self, file_path: str) -> Tuple[str, Dict]:
        """Read a UTF-8 text file; returns (content, metadata) or ("", {}) on failure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return content, {"format": "TEXT"}
        except Exception as e:
            logger.error(f"Error processing text document: {e}")
            return "", {}

    def scrape_website(self, url: str) -> Tuple[str, str, Dict]:
        """Fetch *url* and return (plain_text, title, metadata).

        Returns ("", "", {}) on any network or parsing failure.
        """
        try:
            # Browser-like User-Agent: some regulatory sites reject the
            # default python-requests client string.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            # Remove non-content elements before extracting text.
            for script in soup(["script", "style"]):
                script.decompose()
            title = soup.title.string if soup.title else url
            content = soup.get_text()
            # Collapse the whitespace runs left behind by tag removal.
            content = re.sub(r'\s+', ' ', content).strip()
            metadata = {
                "title": title,
                "url": url,
                "scraped_at": datetime.now().isoformat()
            }
            return content, title, metadata
        except Exception as e:
            logger.error(f"Error scraping website {url}: {e}")
            return "", "", {}

    def add_document(self, file_path: str, filename: str, category: str) -> str:
        """Extract, store, and embed one uploaded document.

        Returns a human-readable status message for the UI.
        """
        try:
            # Dispatch on extension: PDFs via PyMuPDF, everything else read
            # as UTF-8 text.
            if filename.lower().endswith('.pdf'):
                content, metadata = self.process_pdf_document(file_path)
            else:
                content, metadata = self.process_text_document(file_path)
            if not content:
                return "Error: Could not extract content from document"
            content_hash = self.get_content_hash(content)
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    INSERT INTO documents (filename, content, content_hash, category, metadata)
                    VALUES (?, ?, ?, ?, ?)
                ''', (filename, content, content_hash, category, json.dumps(metadata)))
                doc_id = cursor.lastrowid
                conn.commit()
                # Commit before embedding: generate_embeddings_for_content
                # opens its own connection to the same database file.
                self.generate_embeddings_for_content(content, 'document', doc_id)
                conn.close()
                # BUG FIX: original returned a literal "(unknown)" instead of
                # interpolating the uploaded filename.
                return f"Document '{filename}' added successfully to category '{category}'"
            except sqlite3.IntegrityError:
                # UNIQUE(content_hash) violation -> duplicate upload.
                conn.close()
                return "Document already exists in the knowledge base"
        except Exception as e:
            logger.error(f"Error adding document: {e}")
            return f"Error adding document: {str(e)}"

    def add_website(self, url: str) -> str:
        """Scrape, store, and embed one website; returns a status message."""
        try:
            content, title, metadata = self.scrape_website(url)
            if not content:
                return "Error: Could not scrape website content"
            content_hash = self.get_content_hash(content)
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            try:
                cursor.execute('''
                    INSERT INTO websites (url, content, content_hash, title, metadata)
                    VALUES (?, ?, ?, ?, ?)
                ''', (url, content, content_hash, title, json.dumps(metadata)))
                website_id = cursor.lastrowid
                conn.commit()
                self.generate_embeddings_for_content(content, 'website', website_id)
                conn.close()
                return f"Website '{title}' added successfully"
            except sqlite3.IntegrityError:
                conn.close()
                return "Website already exists in the knowledge base"
        except Exception as e:
            logger.error(f"Error adding website: {e}")
            return f"Error adding website: {str(e)}"

    def add_standard(self, standard_name: str, content: str, version: str = "") -> str:
        """Store and embed one pasted standard; returns a status message."""
        try:
            if not content.strip():
                return "Error: Standard content cannot be empty"
            content_hash = self.get_content_hash(content)
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            metadata = {"version": version, "added_at": datetime.now().isoformat()}
            try:
                cursor.execute('''
                    INSERT INTO standards (standard_name, content, content_hash, version, metadata)
                    VALUES (?, ?, ?, ?, ?)
                ''', (standard_name, content, content_hash, version, json.dumps(metadata)))
                standard_id = cursor.lastrowid
                conn.commit()
                self.generate_embeddings_for_content(content, 'standard', standard_id)
                conn.close()
                return f"Standard '{standard_name}' added successfully"
            except sqlite3.IntegrityError:
                conn.close()
                return "Standard already exists in the knowledge base"
        except Exception as e:
            logger.error(f"Error adding standard: {e}")
            return f"Error adding standard: {str(e)}"

    def generate_embeddings_for_content(self, content: str, source_type: str, source_id: int):
        """Chunk *content*, embed each chunk, and persist the vectors.

        source_type is one of 'document' / 'website' / 'standard' and pairs
        with source_id to link each embedding row back to its source.
        No-op (with an error log) if the embedding model failed to load.
        """
        if not self.embedding_model:
            logger.error("Embedding model not available")
            return
        chunks = self.chunk_text(content)
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        for i, chunk in enumerate(chunks):
            try:
                embedding = self.embedding_model.encode(chunk)
                # NOTE: pickle is acceptable here because the blobs are
                # written and read only by this local application; never
                # unpickle data from an untrusted database.
                embedding_blob = pickle.dumps(embedding)
                cursor.execute('''
                    INSERT INTO embeddings (source_type, source_id, chunk_index, embedding, text_chunk)
                    VALUES (?, ?, ?, ?, ?)
                ''', (source_type, source_id, i, embedding_blob, chunk))
            except Exception as e:
                logger.error(f"Error generating embedding for chunk {i}: {e}")
        conn.commit()
        conn.close()

    def search_knowledge_base(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return the *top_k* chunks most similar to *query*.

        Brute-force scan: every stored embedding is unpickled and compared
        with cosine similarity (fine for small knowledge bases; a vector
        index would be needed at scale).  Returns [] if the model is
        unavailable or on any error.
        """
        if not self.embedding_model:
            return []
        try:
            query_embedding = self.embedding_model.encode(query)
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            # Join each embedding row to its source table to recover a
            # display name for the UI.
            cursor.execute('''
                SELECT e.source_type, e.source_id, e.text_chunk, e.embedding,
                       CASE
                           WHEN e.source_type = 'document' THEN d.filename
                           WHEN e.source_type = 'website' THEN w.title
                           WHEN e.source_type = 'standard' THEN s.standard_name
                       END as source_name
                FROM embeddings e
                LEFT JOIN documents d ON e.source_type = 'document' AND e.source_id = d.id
                LEFT JOIN websites w ON e.source_type = 'website' AND e.source_id = w.id
                LEFT JOIN standards s ON e.source_type = 'standard' AND e.source_id = s.id
            ''')
            results = []
            for row in cursor.fetchall():
                try:
                    stored_embedding = pickle.loads(row[3])
                    similarity = cosine_similarity([query_embedding], [stored_embedding])[0][0]
                    results.append({
                        'source_type': row[0],
                        'source_id': row[1],
                        'text_chunk': row[2],
                        'source_name': row[4],
                        'similarity': similarity
                    })
                except Exception as e:
                    logger.error(f"Error processing embedding: {e}")
            conn.close()
            # Highest similarity first; keep only the top_k hits.
            results.sort(key=lambda x: x['similarity'], reverse=True)
            return results[:top_k]
        except Exception as e:
            logger.error(f"Error searching knowledge base: {e}")
            return []

    def get_knowledge_base_stats(self) -> Dict:
        """Return row counts for each table as a dict for the stats tab."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        stats = {}
        cursor.execute("SELECT COUNT(*) FROM documents")
        stats['documents'] = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM websites")
        stats['websites'] = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM standards")
        stats['standards'] = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM embeddings")
        stats['embeddings'] = cursor.fetchone()[0]
        conn.close()
        return stats
# Module-level singleton: constructing MedicalRAGSystem opens/creates the
# SQLite database and loads the sentence-transformer model at import time.
rag_system = MedicalRAGSystem()
def handle_document_upload(files, category):
    """Gradio callback: add each uploaded file to the knowledge base.

    Returns one status line per file, joined with newlines.
    """
    if not files:
        return "No files selected"
    outcomes = [
        rag_system.add_document(upload.name, os.path.basename(upload.name), category)
        for upload in files
    ]
    return "\n".join(outcomes)
def handle_website_addition(url):
    """Gradio callback: validate *url*, then scrape and store the website."""
    cleaned = url.strip()
    if not cleaned:
        return "Please enter a valid URL"
    return rag_system.add_website(cleaned)
def handle_standard_addition(standard_name, content, version):
    """Gradio callback: validate inputs, then store and embed the standard."""
    name = standard_name.strip()
    body = content.strip()
    if not (name and body):
        return "Please provide both standard name and content"
    return rag_system.add_standard(name, body, version.strip())
def handle_search(query):
    """Gradio callback: semantic search plus an extractive answer.

    Returns (results_markdown, answer_markdown) for the two UI panes.
    """
    cleaned = query.strip()
    if not cleaned:
        return "Please enter a search query", ""
    hits = rag_system.search_knowledge_base(cleaned)
    if not hits:
        return "No relevant results found", ""
    rendered = []
    context = []
    for rank, hit in enumerate(hits, 1):
        chunk = hit['text_chunk']
        # Truncate long chunks in the display; the full chunk still feeds
        # the answer generator.
        preview = chunk[:300] + ('...' if len(chunk) > 300 else '')
        rendered.append(f"""
**Result {rank}** (Similarity: {hit['similarity'] * 100:.1f}%)
**Source:** {hit['source_name']} ({hit['source_type']})
**Content:** {preview}
---
""")
        context.append(chunk)
    return "\n".join(rendered), generate_answer(cleaned, context)
def generate_answer(query: str, context: List[str]) -> str:
    """Build an extractive answer from retrieved *context* chunks.

    Collects every sentence (split on '.') containing at least one query
    term, de-duplicates while preserving order, and returns up to three of
    them; falls back to a canned message when nothing matches.  A
    generative model could replace this in a production system.
    """
    terms = query.lower().split()
    matched = []
    for chunk in context:
        for sentence in chunk.split('.'):
            lowered = sentence.lower()
            if any(term in lowered for term in terms):
                matched.append(sentence.strip())
    if not matched:
        return "The retrieved content may contain relevant information, but I couldn't extract a specific answer. Please review the search results above."
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    deduped = list(dict.fromkeys(matched))
    return "Based on the regulatory documents:\n\n" + "\n\n".join(deduped[:3])
def get_stats():
    """Gradio callback: render knowledge-base row counts as display text."""
    counts = rag_system.get_knowledge_base_stats()
    return f"""
Knowledge Base Statistics:
- Documents: {counts['documents']}
- Websites: {counts['websites']}
- Standards: {counts['standards']}
- Total Text Chunks: {counts['embeddings']}
"""
# Build the Gradio UI: one tab per workflow (search, add documents, add
# websites, add standards, statistics).  Emoji in labels were mojibake in
# the corrupted source and have been restored to their intended UTF-8
# characters (the surviving globe glyph confirmed the double-decoding).
with gr.Blocks(title="Medical Devices RAG System", theme=gr.themes.Soft()) as demo:
    # Markdown string content stays at column 0: leading spaces would be
    # rendered as Markdown code blocks.
    gr.Markdown("""
# 🏥 Medical Devices Regulatory RAG System
A comprehensive knowledge base system for medical device regulatory analysts.
Add documents, websites, and standards to build your regulatory knowledge base.
""")
    with gr.Tabs():
        # --- Search tab ---
        with gr.Tab("🔍 Search Knowledge Base"):
            gr.Markdown("### Search your regulatory knowledge base")
            search_input = gr.Textbox(
                placeholder="Enter your regulatory question (e.g., 'What are the requirements for Class II medical devices?')",
                label="Search Query",
                lines=2
            )
            search_button = gr.Button("Search", variant="primary")
            with gr.Row():
                with gr.Column():
                    search_results = gr.Markdown(label="Search Results")
                with gr.Column():
                    answer_output = gr.Markdown(label="Generated Answer")
            search_button.click(
                handle_search,
                inputs=[search_input],
                outputs=[search_results, answer_output]
            )
        # --- Add documents tab ---
        with gr.Tab("📄 Add Documents"):
            gr.Markdown("### Add regulatory documents (PDF, TXT)")
            document_files = gr.File(
                label="Upload Documents",
                file_count="multiple",
                file_types=[".pdf", ".txt", ".docx"]
            )
            document_category = gr.Dropdown(
                choices=["EU MDR 2017/745", "CMDR SOR/98-282", "MDCG", "MDSAP Audit Approach", "UK MDR", "Other"],
                label="Document Category",
                value="Other"
            )
            add_doc_button = gr.Button("Add Documents", variant="primary")
            doc_output = gr.Textbox(label="Result", lines=3)
            add_doc_button.click(
                handle_document_upload,
                inputs=[document_files, document_category],
                outputs=[doc_output]
            )
        # --- Add websites tab ---
        with gr.Tab("🌐 Add Websites"):
            gr.Markdown("### Add regulatory websites")
            website_url = gr.Textbox(
                placeholder="https://www.fda.gov/medical-devices/...",
                label="Website URL",
                lines=1
            )
            add_website_button = gr.Button("Add Website", variant="primary")
            website_output = gr.Textbox(label="Result", lines=3)
            gr.Markdown("**Suggested regulatory websites:**")
            gr.Markdown("""
- US FDA 21CFR: https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfcfr/cfrsearch.cfm
- EU Medical Devices: https://ec.europa.eu/health/medical-devices-sector_en
- Health Canada Medical Devices: https://www.canada.ca/en/health-canada/services/drugs-health-products/medical-devices.html
""")
            add_website_button.click(
                handle_website_addition,
                inputs=[website_url],
                outputs=[website_output]
            )
        # --- Add standards tab ---
        with gr.Tab("📋 Add Standards"):
            gr.Markdown("### Add regulatory standards")
            standard_name = gr.Textbox(
                placeholder="ISO 13485:2016",
                label="Standard Name",
                lines=1
            )
            standard_version = gr.Textbox(
                placeholder="2016 (optional)",
                label="Version",
                lines=1
            )
            standard_content = gr.Textbox(
                placeholder="Enter or paste the standard content here...",
                label="Standard Content",
                lines=10
            )
            add_standard_button = gr.Button("Add Standard", variant="primary")
            standard_output = gr.Textbox(label="Result", lines=3)
            add_standard_button.click(
                handle_standard_addition,
                inputs=[standard_name, standard_content, standard_version],
                outputs=[standard_output]
            )
        # --- Statistics tab ---
        with gr.Tab("📊 Knowledge Base Stats"):
            gr.Markdown("### Knowledge Base Statistics")
            stats_button = gr.Button("Refresh Statistics", variant="secondary")
            stats_output = gr.Textbox(label="Statistics", lines=8)
            stats_button.click(
                get_stats,
                outputs=[stats_output]
            )
    # Populate the statistics pane when the page first loads.
    demo.load(get_stats, outputs=[stats_output])

if __name__ == "__main__":
    # share=True requests a public gradio.live tunnel; hosting platforms
    # that already expose the app (e.g. Hugging Face Spaces) ignore it.
    demo.launch(share=True)