# Source: multimodal-rag / app.py
# (GitHub page residue preserved as a comment: author "itachi",
#  commit "Remove remaining emojis from app.py", 2498ba5)
"""
Hugging Face Spaces Entry Point.
Simplified Gradio app for deployment to HuggingFace Spaces.
"""
import gradio as gr
import os
from pathlib import Path
# Download NLTK data at startup
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
# Import RAG components
from src.preprocessing import PDFParser, TextChunker
from src.embeddings import CustomEmbedder
from src.retrieval import FAISSVectorStore, HybridRetriever, DenseRetriever, SparseRetriever, RAGPipeline, Document
# Global state
# Lazily-built singletons shared by all Gradio callbacks. initialize()
# populates embedder/vector_store; upload_and_process() builds rag_pipeline.
rag_pipeline = None  # RAGPipeline; stays None until documents are indexed
embedder = None  # CustomEmbedder used for both indexing and query encoding
vector_store = None  # FAISSVectorStore holding chunk embeddings
def initialize():
    """Build the global embedder and a matching empty FAISS vector store.

    Runs once at startup and lazily from upload_and_process() when the
    embedder has not been created yet. Every call replaces both globals.

    Returns:
        A human-readable status string.
    """
    global embedder, vector_store
    fresh_embedder = CustomEmbedder()
    fresh_store = FAISSVectorStore(embedding_dim=fresh_embedder.embedding_dim)
    embedder, vector_store = fresh_embedder, fresh_store
    return "[OK] System initialized!"
def upload_and_process(files):
    """Parse, chunk, embed, and index the uploaded PDF files.

    Non-PDF uploads are skipped. Each call appends the new chunks to the
    global vector store and rebuilds the RAG pipeline over it.

    Args:
        files: Gradio file objects, each exposing a ``.name`` path.

    Returns:
        A status string describing success or the failure reason.
    """
    global vector_store, embedder, rag_pipeline
    if not files:
        return "[ERROR] Please upload PDF files"
    if embedder is None:
        initialize()
    from uuid import uuid4  # stdlib; local import keeps top-of-file imports untouched

    pdf_parser = PDFParser()
    chunker = TextChunker(chunk_size=512, chunk_overlap=50)
    all_chunks = []
    for file in files:
        file_path = Path(file.name)
        if file_path.suffix.lower() != ".pdf":
            continue  # silently skip non-PDF uploads, as before
        doc = pdf_parser.parse(file_path)
        for page in doc.pages:
            for chunk in chunker.chunk(page.text):
                chunk.metadata["source"] = file_path.name
                chunk.metadata["page"] = page.page_number
                all_chunks.append(chunk)
    if not all_chunks:
        return "[ERROR] No text extracted from PDFs"
    # BUG FIX: the original ids were f"doc_{i}" with i restarting at 0 on
    # every call, so a second upload produced ids colliding with documents
    # already in the store. Use globally unique ids instead.
    documents = [
        Document(id=f"doc_{uuid4().hex}", text=chunk.text, metadata=chunk.metadata)
        for chunk in all_chunks
    ]
    embeddings = embedder.encode([d.text for d in documents])
    vector_store.add_documents(documents, embeddings)
    # Rebuild the retriever stack over the (now larger) dense index.
    # NOTE(review): the sparse retriever is constructed from only this
    # batch's documents, so earlier uploads drop out of sparse scoring on
    # subsequent calls -- confirm whether SparseRetriever can accumulate.
    dense_retriever = DenseRetriever(vector_store=vector_store, embedder=embedder)
    sparse_retriever = SparseRetriever(documents=documents)
    retriever = HybridRetriever(dense_retriever=dense_retriever, sparse_retriever=sparse_retriever)
    rag_pipeline = RAGPipeline(retriever=retriever, model_name="qwen2")
    return f"[OK] Processed {len(files)} files, {len(documents)} chunks indexed!"
def query(message, history):
    """Answer a chat question against the indexed documents.

    Args:
        message: Question text from the chat box.
        history: Prior chat turns (unused; required by gr.ChatInterface).

    Returns:
        The generated answer with up to three source citations appended,
        or an error string when the system is not ready.
    """
    global rag_pipeline
    if rag_pipeline is None:
        return "[ERROR] Please upload documents first!"
    if not message.strip():
        return "[ERROR] Please enter a question"
    try:
        response = rag_pipeline.query(message, top_k=5)
        parts = [response.answer]
        if response.citations:
            parts.append("\n\n---\n**Sources:**\n")
            # Show at most the first three citations.
            for idx, citation in enumerate(response.citations[:3], 1):
                entry = f"\n[{idx}] {citation.source_file}"
                if citation.page:
                    entry += f" (p.{citation.page})"
                parts.append(entry)
        return "".join(parts)
    except Exception as e:
        return f"[ERROR] Error: {str(e)}"
# Build Gradio interface: two tabs (ingestion + chat) inside a Blocks layout.
with gr.Blocks(
    title="Multimodal RAG System",
    theme=gr.themes.Soft(primary_hue="blue")
) as demo:
    gr.Markdown("""
    # Multimodal RAG System
    Upload PDF documents and ask questions!
    """)
    # Tab 1: document ingestion -- the button triggers upload_and_process
    # and writes its status string into the read-only textbox.
    with gr.Tab("Upload Documents"):
        file_upload = gr.File(
            label="Upload PDFs",
            file_count="multiple",
            file_types=[".pdf"]
        )
        upload_btn = gr.Button("Process Documents", variant="primary")
        upload_status = gr.Textbox(label="Status", interactive=False)
        upload_btn.click(upload_and_process, inputs=[file_upload], outputs=[upload_status])
    # Tab 2: chat -- ChatInterface calls query(message, history) per turn.
    with gr.Tab("Chat"):
        chatbot = gr.ChatInterface(
            fn=query,
            title="Ask Questions",
            examples=[
                "What is this document about?",
                "Summarize the main points",
                "What are the key findings?"
            ]
        )
    gr.Markdown("---\n*Powered by FAISS, Sentence Transformers & Open-Source LLMs*")
if __name__ == "__main__":
    # Eagerly build the embedder/vector store so the first upload does not
    # pay the lazy-initialization cost inside upload_and_process().
    initialize()
    # Bind to all interfaces on port 7860 (the standard HF Spaces port);
    # no public share link since Spaces provides the URL.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )