Spaces:

MCP-1st-Birthday
/

sdlc-agent

Runtime error

App Files Files Community

sdlc-agent / src /rag /modal-rag-product-design.py

Veeru-c

initial commit

fc9883e 22 days ago

raw

history blame contribute delete

20.1 kB

	"""
	Extended Modal RAG to include product design documents (Word, PDF, Excel)
	This extends the existing modal-rag.py to support querying the product design spec
	"""

	import modal

	app = modal.App("insurance-rag-product-design")

	# Reference your specific volume
	vol = modal.Volume.from_name("mcp-hack-ins-products", create_if_missing=True)

	# Model configuration (same as existing)
	LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
	EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"

	# Build image with ALL required dependencies
	image = (
	modal.Image.debian_slim(python_version="3.11")
	.pip_install(
	"vllm==0.6.3.post1",
	"langchain==0.3.7",
	"langchain-community==0.3.7",
	"langchain-text-splitters==0.3.2",
	"sentence-transformers==3.3.0",
	"chromadb==0.5.20",
	"pypdf==5.1.0", # For PDF documents
	"python-docx==1.1.0", # For Word documents
	"openpyxl==3.1.2", # For Excel documents (.xlsx)
	"pandas==2.2.0", # For Excel data processing
	"xlrd==2.0.1", # For older Excel files (.xls)
	"cryptography==43.0.3",
	"transformers==4.46.2",
	"torch==2.4.0",
	"huggingface_hub==0.26.2",
	)
	)

	@app.function(image=image, volumes={"/insurance-data": vol})
	def list_volume_files():
	"""List all files in the volume to debug"""
	import os
	print("🔍 Listing all files in /insurance-data...")
	all_files = []
	for root, dirs, files in os.walk("/insurance-data"):
	for file in files:
	full_path = os.path.join(root, file)
	all_files.append(full_path)
	print(f" 📄 {full_path}")
	return all_files

	@app.function(image=image, volumes={"/insurance-data": vol})
	def load_product_design_docs():
	"""Load product design documents (Word, PDF, Excel only - no markdown)"""
	import os
	import docx
	from pathlib import Path

	documents = []

	# First, list what's actually in the volume for debugging
	print("🔍 Scanning volume for product design documents (Word, PDF, Excel only)...")
	all_files = []
	for root, dirs, files in os.walk("/insurance-data"):
	for file in files:
	full_path = os.path.join(root, file)
	all_files.append(full_path)
	# Only show supported file types
	file_lower = file.lower()
	if file.endswith(('.docx', '.pdf', '.xlsx', '.xls')):
	if 'tokyo_auto_insurance' in file_lower or 'product_design' in file_lower:
	print(f" 📄 Found: {full_path}")

	# Load PDF files
	pdf_files = []
	for root, dirs, files in os.walk("/insurance-data"):
	for file in files:
	if file.endswith('.pdf'):
	full_path = os.path.join(root, file)
	file_lower = file.lower()
	if 'tokyo_auto_insurance' in file_lower or 'product_design' in file_lower:
	pdf_files.append(full_path)

	print(f"📄 Found {len(pdf_files)} PDF product design files")
	for pdf_file in pdf_files:
	try:
	from pypdf import PdfReader
	reader = PdfReader(pdf_file)
	text_content = []
	for page in reader.pages:
	text_content.append(page.extract_text())

	full_text = '\n'.join(text_content)
	if not full_text.strip():
	print(f" ⚠️ No text extracted from {pdf_file}")
	continue

	documents.append({
	'page_content': full_text,
	'metadata': {
	'source': pdf_file,
	'type': 'product_design',
	'format': 'pdf'
	}
	})
	print(f" ✅ Loaded: {pdf_file} ({len(full_text)} characters)")
	except Exception as e:
	print(f" ⚠️ Error loading {pdf_file}: {e}")

	# Load Excel files
	excel_files = []
	for root, dirs, files in os.walk("/insurance-data"):
	for file in files:
	if file.endswith(('.xlsx', '.xls')):
	full_path = os.path.join(root, file)
	file_lower = file.lower()
	if 'tokyo_auto_insurance' in file_lower or 'product_design' in file_lower:
	excel_files.append(full_path)

	print(f"📊 Found {len(excel_files)} Excel product design files")
	for excel_file in excel_files:
	try:
	import pandas as pd
	# Read all sheets
	excel_data = pd.read_excel(excel_file, sheet_name=None)
	text_content = []

	for sheet_name, df in excel_data.items():
	text_content.append(f"Sheet: {sheet_name}")
	# Convert DataFrame to text representation
	text_content.append(df.to_string())
	text_content.append("") # Empty line between sheets

	full_text = '\n'.join(text_content)
	if not full_text.strip():
	print(f" ⚠️ No data extracted from {excel_file}")
	continue

	documents.append({
	'page_content': full_text,
	'metadata': {
	'source': excel_file,
	'type': 'product_design',
	'format': 'excel'
	}
	})
	print(f" ✅ Loaded: {excel_file} ({len(full_text)} characters)")
	except Exception as e:
	print(f" ⚠️ Error loading {excel_file}: {e}")

	# Load Word documents - check both root and docs subdirectory
	docx_files = []
	for root, dirs, files in os.walk("/insurance-data"):
	for file in files:
	if file.endswith('.docx'):
	full_path = os.path.join(root, file)
	# Match product design files (case insensitive)
	file_lower = file.lower()
	if 'product_design' in file_lower or 'tokyo_auto_insurance' in file_lower:
	docx_files.append(full_path)

	print(f"📄 Found {len(docx_files)} Word product design files")
	for docx_file in docx_files:
	try:
	# Check if file exists
	if not os.path.exists(docx_file):
	print(f" ⚠️ File does not exist: {docx_file}")
	continue

	# Check file size
	file_size = os.path.getsize(docx_file)
	print(f" 📊 File size: {file_size} bytes")

	# Try opening with python-docx
	# python-docx might have issues with Modal volume files, so we'll try a workaround
	try:
	doc = docx.Document(docx_file)
	except Exception as e1:
	# If direct opening fails, try copying to temp first
	import tempfile
	import shutil
	print(f" ⚠️ Direct open failed: {e1}, trying temp copy...")
	with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as tmp:
	shutil.copy2(docx_file, tmp.name)
	doc = docx.Document(tmp.name)
	tmp_path = tmp.name

	text_content = []
	for para in doc.paragraphs:
	if para.text.strip():
	text_content.append(para.text)

	# Also extract tables
	for table in doc.tables:
	for row in table.rows:
	row_text = ' \| '.join([cell.text for cell in row.cells])
	if row_text.strip():
	text_content.append(row_text)

	full_text = '\n'.join(text_content)
	if not full_text.strip():
	print(f" ⚠️ No text extracted from {docx_file}")
	continue

	documents.append({
	'page_content': full_text,
	'metadata': {
	'source': docx_file,
	'type': 'product_design',
	'format': 'word'
	}
	})
	print(f" ✅ Loaded: {docx_file} ({len(full_text)} characters)")

	# Clean up temp file if we created one
	if 'tmp_path' in locals():
	try:
	os.unlink(tmp_path)
	except:
	pass

	except Exception as e:
	print(f" ⚠️ Error loading {docx_file}: {e}")
	import traceback
	print(f" Traceback: {traceback.format_exc()}")

	print(f"✅ Loaded {len(documents)} product design documents")
	return documents

	@app.function(
	image=image,
	volumes={"/insurance-data": vol},
	timeout=900
	)
	def create_product_design_vector_db():
	"""Create vector database from product design documents"""
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter

	print("🔍 Loading product design documents...")
	documents = load_product_design_docs.remote()

	if len(documents) == 0:
	return {
	"status": "error",
	"message": "No product design documents found",
	"total_documents": 0,
	"total_chunks": 0
	}

	print("✂️ Splitting documents into chunks...")
	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=1000,
	chunk_overlap=200
	)

	# Convert to LangChain document format
	from langchain.schema import Document
	langchain_docs = []
	for doc in documents:
	langchain_docs.append(Document(
	page_content=doc['page_content'],
	metadata=doc['metadata']
	))

	chunks = text_splitter.split_documents(langchain_docs)
	print(f"📦 Created {len(chunks)} chunks")

	print("🧠 Creating embeddings...")
	# Try CUDA first, fall back to CPU if not available
	import torch
	device = 'cuda' if torch.cuda.is_available() else 'cpu'
	print(f" Using device: {device}")

	embeddings = HuggingFaceEmbeddings(
	model_name=EMBEDDING_MODEL,
	model_kwargs={'device': device},
	encode_kwargs={'normalize_embeddings': True}
	)

	print("💾 Building vector database...")

	# Connect to remote Chroma service
	chroma_service = modal.Cls.from_name("chroma-server-v2", "ChromaDB")()

	# Prepare data for upsert
	ids = [f"product_design_{i}" for i in range(len(chunks))]
	documents_text = [chunk.page_content for chunk in chunks]
	metadatas = [chunk.metadata for chunk in chunks]

	# Generate embeddings locally
	print(" Generating embeddings locally...")
	embeddings_list = embeddings.embed_documents(documents_text)

	# Upsert to remote Chroma (use separate collection for product design)
	print(" Upserting to remote Chroma DB...")
	batch_size = 100
	for i in range(0, len(ids), batch_size):
	batch_ids = ids[i:i+batch_size]
	batch_docs = documents_text[i:i+batch_size]
	batch_metas = metadatas[i:i+batch_size]
	batch_embs = embeddings_list[i:i+batch_size]

	chroma_service.upsert.remote(
	collection_name="product_design", # Separate collection
	ids=batch_ids,
	documents=batch_docs,
	embeddings=batch_embs,
	metadatas=batch_metas
	)
	print(f" Upserted batch {i//batch_size + 1}/{(len(ids)-1)//batch_size + 1}")

	print("✅ Product design vector database created!")

	return {
	"status": "success",
	"total_documents": len(documents),
	"total_chunks": len(chunks)
	}

	@app.cls(
	image=image,
	volumes={"/insurance-data": vol},
	gpu="A10G",
	timeout=600,
	max_containers=1,
	min_containers=0
	)
	class ProductDesignRAG:
	"""RAG model specifically for product design document queries"""

	@modal.enter()
	def enter(self):
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from vllm import LLM, SamplingParams
	from langchain.schema import Document

	print("🚀 Initializing Product Design RAG...")

	# Initialize embeddings
	self.embeddings = HuggingFaceEmbeddings(
	model_name=EMBEDDING_MODEL,
	model_kwargs={'device': 'cuda'},
	encode_kwargs={'normalize_embeddings': True}
	)

	# Connect to Chroma
	self.chroma_service = modal.Cls.from_name("chroma-server-v2", "ChromaDB")()

	# Custom retriever for remote Chroma
	class RemoteChromaRetriever:
	def __init__(self, chroma_service, embeddings, k=3):
	self.chroma_service = chroma_service
	self.embeddings = embeddings
	self.k = k

	def get_relevant_documents(self, query: str):
	query_embedding = self.embeddings.embed_query(query)
	results = self.chroma_service.query.remote(
	collection_name="product_design",
	query_embeddings=[query_embedding],
	n_results=self.k
	)

	docs = []
	if results and 'documents' in results and len(results['documents']) > 0:
	for i, doc_text in enumerate(results['documents'][0]):
	metadata = results.get('metadatas', [[{}]])[0][i] if 'metadatas' in results else {}
	docs.append(Document(page_content=doc_text, metadata=metadata))

	return docs

	self.RemoteChromaRetriever = RemoteChromaRetriever

	# Load LLM
	print(" Loading LLM...")
	self.llm_engine = LLM(
	model=LLM_MODEL,
	dtype="float16",
	gpu_memory_utilization=0.85,
	max_model_len=4096,
	trust_remote_code=True,
	enforce_eager=True
	)

	self.sampling_params = SamplingParams(
	temperature=0.7,
	max_tokens=1536, # Increased for comprehensive, detailed answers
	top_p=0.9,
	stop=["\n\n\n", "Question:", "Context:", "<\|end\|>"] # Removed single \n\n to allow longer answers
	)

	print("✅ Product Design RAG ready!")

	@modal.method()
	def query(self, question: str, top_k: int = 5): # Increased from 3 to 5 for more context
	"""Query the product design document"""
	import time
	start_time = time.time()

	print(f"❓ Query: {question}")

	# Retrieve relevant documents
	retrieval_start = time.time()
	retriever = self.RemoteChromaRetriever(
	chroma_service=self.chroma_service,
	embeddings=self.embeddings,
	k=top_k
	)
	docs = retriever.get_relevant_documents(question)
	retrieval_time = time.time() - retrieval_start

	if not docs:
	return {
	"question": question,
	"answer": "No relevant information found in the product design document.",
	"retrieval_time": retrieval_time,
	"generation_time": 0,
	"sources": []
	}

	# Build context
	context = "\n\n".join([doc.page_content for doc in docs])

	# Create prompt with instructions for comprehensive answers
	prompt = f"""<\|system\|>
	You are a helpful AI assistant that answers questions about the TokyoDrive Insurance product design document.
	Provide comprehensive, detailed answers with specific information from the document.
	Structure your answer clearly with:
	- A brief summary if relevant
	- Detailed explanations with specific numbers, percentages, and data points
	- Step-by-step guidance when appropriate
	- Clear formatting (use bullet points or numbered lists when helpful)
	Be thorough and cite specific details from the context. If information is not available, say so clearly.<\|end\|>
	<\|user\|>
	Context from Product Design Document:
	{context}

	Question:
	{question}<\|end\|>
	<\|assistant\|>"""

	# Generate answer
	outputs = self.llm_engine.generate(prompts=[prompt], sampling_params=self.sampling_params)
	answer = outputs[0].outputs[0].text.strip()
	generation_time = time.time() - start_time - retrieval_time

	# Prepare sources with better content extraction
	sources = []
	for doc in docs:
	# Clean up source content - remove markdown table syntax
	content = doc.page_content
	# Remove markdown table separators
	import re
	content = re.sub(r'\\|[\s\-:]+\\|', '', content)
	content = re.sub(r'^\\|.*\\|$', '', content, flags=re.MULTILINE)
	content = re.sub(r'\s+\\|\s+', ' ', content)
	content = content.strip()

	sources.append({
	"content": content[:500], # Increased from 300 to 500
	"metadata": doc.metadata
	})

	return {
	"question": question,
	"answer": answer,
	"retrieval_time": retrieval_time,
	"generation_time": generation_time,
	"sources": sources
	}

	@app.local_entrypoint()
	def list_files():
	"""List files in volume for debugging"""
	print("🔍 Listing files in volume...")
	files = list_volume_files.remote()
	print(f"\n✅ Found {len(files)} files total")

	@app.local_entrypoint()
	def index_product_design():
	"""Index product design documents"""
	print("🚀 Indexing product design documents...")

	# First, list files to debug
	print("\n📋 Checking volume contents...")
	try:
	files = list_volume_files.remote()
	print(f"Found {len(files)} files in volume")
	except Exception as e:
	print(f"Could not list files: {e}")

	result = create_product_design_vector_db.remote()
	print(f"\n{'='*60}")
	print(f"Status: {result['status']}")
	if result['status'] == 'success':
	print(f"Documents processed: {result['total_documents']}")
	print(f"Text chunks created: {result['total_chunks']}")
	print("✅ Product design vector database is ready!")
	else:
	print(f"❌ Error: {result['message']}")
	print("\n💡 Tip: Make sure files are uploaded to the volume:")
	print(" modal volume put mcp-hack-ins-products \\")
	print(" docs/tokyo_auto_insurance_product_design.docx")
	print(" # Or PDF/Excel files:")
	print(" # modal volume put mcp-hack-ins-products docs/file.pdf")
	print(" # modal volume put mcp-hack-ins-products docs/file.xlsx")
	print(f"{'='*60}")

	@app.local_entrypoint()
	def query_product_design(question: str = "What are the three product tiers and their premium ranges?"):
	"""Query the product design document"""
	print(f"🤔 Question: {question}\n")

	model = ProductDesignRAG()
	result = model.query.remote(question)

	print(f"{'='*60}")
	print(f"📝 Answer:")
	print(f"{result['answer']}\n")
	print(f"{'='*60}")
	print(f"⏱️ Retrieval: {result['retrieval_time']:.2f}s")
	print(f"⏱️ Generation: {result['generation_time']:.2f}s")

	if result['sources']:
	print(f"\n📚 Sources ({len(result['sources'])}):")
	for i, source in enumerate(result['sources'], 1):
	print(f"\n{i}. {source['metadata'].get('source', 'Unknown')}")
	print(f" {source['content'][:200]}...")

	# Define data model for API
	from pydantic import BaseModel

	class RAGQuery(BaseModel):
	question: str
	top_k: int = 5

	@app.function(image=image)
	@modal.web_endpoint(method="POST")
	def api_query(item: RAGQuery):
	"""Expose RAG query as a web endpoint"""
	model = ProductDesignRAG()
	return model.query.remote(item.question, item.top_k)