# Hugging Face Spaces app entry point (scrape residue "Spaces: / Build error" removed).
| """ | |
| Hugging Face Spaces Entry Point. | |
| Simplified Gradio app for deployment to HuggingFace Spaces. | |
| """ | |
| import gradio as gr | |
| import os | |
| from pathlib import Path | |
# Download NLTK data at startup
# Fetch tokenizer/tagger resources needed by the text chunker and sparse
# retriever. quiet=True keeps Spaces build logs clean; nltk.download is a
# no-op when the data is already cached on the container.
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
| # Import RAG components | |
| from src.preprocessing import PDFParser, TextChunker | |
| from src.embeddings import CustomEmbedder | |
| from src.retrieval import FAISSVectorStore, HybridRetriever, DenseRetriever, SparseRetriever, RAGPipeline, Document | |
# Global state
# Module-level singletons shared by the Gradio callbacks below.
# All start as None and are populated lazily: initialize() fills embedder and
# vector_store; upload_and_process() builds rag_pipeline after the first upload.
rag_pipeline = None   # RAGPipeline — None until documents are indexed
embedder = None       # CustomEmbedder — None until initialize() runs
vector_store = None   # FAISSVectorStore — None until initialize() runs
def initialize():
    """Build the global embedder and FAISS vector store.

    Called once at startup (and lazily from upload_and_process if needed).
    Returns a short status string suitable for display in the UI.
    """
    global embedder, vector_store
    new_embedder = CustomEmbedder()
    embedder = new_embedder
    # The store's index dimensionality must match the embedder's output.
    vector_store = FAISSVectorStore(embedding_dim=new_embedder.embedding_dim)
    return "[OK] System initialized!"
def upload_and_process(files):
    """Parse uploaded PDFs, chunk and embed them, and (re)build the RAG pipeline.

    Parameters
    ----------
    files : list
        Gradio file objects; each exposes a filesystem path via ``.name``.
        Non-PDF entries are skipped.

    Returns
    -------
    str
        Human-readable status message for the UI status box.
    """
    global vector_store, embedder, rag_pipeline
    if not files:
        return "[ERROR] Please upload PDF files"
    if embedder is None:
        # Lazy init in case the module-level startup hook did not run.
        initialize()

    from uuid import uuid4  # local import: only needed on the upload path

    pdf_parser = PDFParser()
    chunker = TextChunker(chunk_size=512, chunk_overlap=50)

    all_chunks = []
    pdf_count = 0  # count only files actually parsed, not everything uploaded
    for file in files:
        file_path = Path(file.name)
        if file_path.suffix.lower() != ".pdf":
            continue  # the UI restricts uploads to .pdf, but guard anyway
        pdf_count += 1
        doc = pdf_parser.parse(file_path)
        for page in doc.pages:
            for chunk in chunker.chunk(page.text):
                # Tag every chunk with its provenance for citation rendering.
                chunk.metadata["source"] = file_path.name
                chunk.metadata["page"] = page.page_number
                all_chunks.append(chunk)

    if not all_chunks:
        return "[ERROR] No text extracted from PDFs"

    # BUGFIX: ids like f"doc_{i}" restart at 0 on every upload and collide with
    # documents already in the vector store; use UUIDs so ids stay unique.
    documents = [
        Document(id=f"doc_{uuid4().hex}", text=chunk.text, metadata=chunk.metadata)
        for chunk in all_chunks
    ]

    embeddings = embedder.encode([d.text for d in documents])
    vector_store.add_documents(documents, embeddings)

    # BUGFIX: the dense FAISS store accumulates across uploads, but the sparse
    # retriever was rebuilt from only the newest batch, silently dropping
    # earlier documents from hybrid retrieval. Accumulate the full corpus here
    # (stored on the function object to keep module globals unchanged).
    corpus = getattr(upload_and_process, "_corpus", [])
    corpus.extend(documents)
    upload_and_process._corpus = corpus

    # Rebuild the pipeline over everything indexed so far.
    dense_retriever = DenseRetriever(vector_store=vector_store, embedder=embedder)
    sparse_retriever = SparseRetriever(documents=corpus)
    retriever = HybridRetriever(dense_retriever=dense_retriever, sparse_retriever=sparse_retriever)
    rag_pipeline = RAGPipeline(retriever=retriever, model_name="qwen2")

    return f"[OK] Processed {pdf_count} files, {len(documents)} chunks indexed!"
def query(message, history):
    """Answer a user question with the RAG pipeline.

    Returns markdown text: the generated answer plus up to three cited
    sources. `history` is supplied by gr.ChatInterface and is unused here.
    """
    global rag_pipeline
    if rag_pipeline is None:
        return "[ERROR] Please upload documents first!"
    if not message.strip():
        return "[ERROR] Please enter a question"
    try:
        response = rag_pipeline.query(message, top_k=5)
        parts = [response.answer]
        if response.citations:
            parts.append("\n\n---\n**Sources:**\n")
            # Show at most three citations, numbered from 1.
            for idx, citation in enumerate(response.citations[:3], 1):
                entry = f"\n[{idx}] {citation.source_file}"
                if citation.page:
                    entry += f" (p.{citation.page})"
                parts.append(entry)
        return "".join(parts)
    except Exception as e:
        # Surface any pipeline failure to the chat UI rather than crashing.
        return f"[ERROR] Error: {str(e)}"
# Build Gradio interface: one tab to upload/index documents, one tab to chat.
with gr.Blocks(
    title="Multimodal RAG System",
    theme=gr.themes.Soft(primary_hue="blue")
) as demo:
    gr.Markdown("""
    # Multimodal RAG System
    Upload PDF documents and ask questions!
    """)
    with gr.Tab("Upload Documents"):
        # Multi-file picker restricted to PDFs (matches upload_and_process).
        file_upload = gr.File(
            label="Upload PDFs",
            file_count="multiple",
            file_types=[".pdf"]
        )
        upload_btn = gr.Button("Process Documents", variant="primary")
        # Read-only status box that shows the callback's returned message.
        upload_status = gr.Textbox(label="Status", interactive=False)
        # Wire the button to the indexing callback.
        upload_btn.click(upload_and_process, inputs=[file_upload], outputs=[upload_status])
    with gr.Tab("Chat"):
        # ChatInterface calls query(message, history) and renders the markdown it returns.
        chatbot = gr.ChatInterface(
            fn=query,
            title="Ask Questions",
            examples=[
                "What is this document about?",
                "Summarize the main points",
                "What are the key findings?"
            ]
        )
    gr.Markdown("---\n*Powered by FAISS, Sentence Transformers & Open-Source LLMs*")
if __name__ == "__main__":
    # Warm up the embedder and vector store before accepting traffic.
    initialize()
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port;
    # no public share link is needed since Spaces provides the URL.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)