Spaces:

bajajhackrx
/

model

Sleeping

App Files Files Community

sohamchitimali commited on Aug 6, 2025

Commit

37dc810

1 Parent(s): 7d17396

Reducing Model Size

Browse files

Files changed (2) hide show

app.py +169 -85
requirements.txt +8 -22

app.py CHANGED Viewed

@@ -17,7 +17,6 @@ from dataclasses import dataclass
 import hashlib
 from fastapi import FastAPI, Request, Header
 from fastapi.responses import JSONResponse
-import uvicorn
 import warnings
 warnings.filterwarnings('ignore')
@@ -25,7 +24,7 @@ warnings.filterwarnings('ignore')
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Create FastAPI app
 api_app = FastAPI(title="High-Performance HackRx API", description="Production-grade AI document query system")
 @api_app.post("/hackrx/run")
@@ -174,9 +173,9 @@ class PowerfulDocumentProcessor:
         return text.strip()
 class OptimizedChunker:
-    """Optimized chunking for better performance"""
-    def __init__(self, chunk_size: int = 512, overlap: int = 100, min_chunk_size: int = 150):
         self.chunk_size = chunk_size
         self.overlap = overlap
         self.min_chunk_size = min_chunk_size
@@ -265,7 +264,7 @@ class OptimizedChunker:
         return min(score, 3.0)
 class PowerfulQASystem:
-    """High-performance QA system using Qwen2.5-3B-Instruct with domain enhancements"""
     def __init__(self):
         self.qa_pipeline = None
@@ -274,36 +273,54 @@ class PowerfulQASystem:
         self.initialize_powerful_models()
     def initialize_powerful_models(self):
-        """Initialize Qwen2.5-3B-Instruct with 4-bit quantization"""
-        model_name = "Qwen/Qwen2.5-3B-Instruct"
-        logger.info(f"Loading high-performance model: {model_name} (4-bit quantized)")
         try:
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4"
-            ) if torch.cuda.is_available() else None
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None,
-                quantization_config=quantization_config
             )
             self.qa_pipeline = pipeline(
                 "text-generation",
                 model=self.model,
                 tokenizer=self.tokenizer,
-                device=0 if torch.cuda.is_available() else -1,
-                max_new_tokens=150,
-                max_length=2048,
-                return_full_text=False
             )
-            logger.info(f"Qwen2.5-3B-Instruct loaded successfully {'with 4-bit quantization' if quantization_config else 'on CPU'}")
         except Exception as e:
-            logger.error(f"Failed to load Qwen2.5-3B-Instruct: {e}")
-            raise RuntimeError(f"Model loading failed: {str(e)}")
     def _enhance_question(self, question: str) -> str:
         """Enhance question for better model understanding"""
@@ -330,15 +347,19 @@ class PowerfulQASystem:
         start_time = time.time()
         try:
             enhanced_question = self._enhance_question(question)
-            prompt = f"[INST] Given the following context:\n{context[:2000]}\n\nAnswer the question: {enhanced_question} [/INST]"
-            result = self.qa_pipeline(prompt)[0]['generated_text'].strip()
             if not result:
                 result = "Unable to generate a meaningful answer based on the provided context."
             enhanced_answer = self._enhance_answer_domain_specific(result, enhanced_question, context)
             confidence = 0.9 if len(top_chunks) > 2 else 0.7
             reasoning = self._generate_reasoning(enhanced_question, enhanced_answer, confidence, top_chunks)
-            token_count = len(self.tokenizer.encode(prompt))
             processing_time = time.time() - start_time
             return {
@@ -346,9 +367,10 @@ class PowerfulQASystem:
                 'confidence': confidence,
                 'reasoning': reasoning,
                 'processing_time': processing_time,
-                'token_count': token_count,
                 'source_chunks': len(top_chunks)
             }
         except Exception as e:
             logger.error(f"Answer generation error: {e}")
             return {
@@ -368,6 +390,7 @@ class PowerfulQASystem:
         answer = answer.strip()
         question_lower = question.lower()
         if 'grace period' in question_lower:
             if any(term in answer.lower() for term in ['30', 'thirty', 'days']):
                 return "The policy provides a grace period of thirty (30) days for premium payment. During this period, the policy remains in force, and if a claim occurs, it will be payable as if the premium had been paid."
@@ -380,33 +403,7 @@ class PowerfulQASystem:
             if any(term in answer.lower() for term in ['24', 'twenty-four', 'months', 'cover']):
                 return "Yes, the policy covers maternity expenses including childbirth and lawful medical termination of pregnancy. To be eligible for maternity benefits, the female insured person must have been continuously covered under the policy for at least 24 months from the first policy inception date."
-        elif 'cataract' in question_lower and 'waiting' in question_lower:
-            if any(term in answer.lower() for term in ['2', 'two', 'years']):
-                return "There is a waiting period of two (2) years for cataract surgery coverage under this policy."
-        elif 'organ donor' in question_lower:
-            if 'cover' in answer.lower() or 'yes' in answer.lower():
-                return "Yes, the policy covers medical expenses for organ donor hospitalization for harvesting organs, provided the organ is donated to an insured person and the donation complies with the Transplantation of Human Organs Act, 1994."
-        elif 'ncd' in question_lower or 'no claim discount' in question_lower:
-            if any(term in answer.lower() for term in ['5%', 'five percent']):
-                return "The policy offers a No Claim Discount (NCD) of 5% on the base premium at renewal for each completed policy year without any claims, subject to a maximum of 5% of the total base premium."
-        elif 'health check' in question_lower:
-            if 'cover' in answer.lower() or 'benefit' in answer.lower():
-                return "Yes, the policy provides coverage for preventive health check-ups. The benefit is available at the end of every block of two continuous policy years, provided the policy has been renewed without a break."
-        elif 'hospital' in question_lower and any(term in question_lower for term in ['define', 'definition', 'what is']):
-            if any(term in answer.lower() for term in ['bed', 'qualified', 'nursing']):
-                return "A Hospital is defined as an institution established for in-patient care and day care treatment with at least 10 in-patient beds in towns with population below 10 lakhs and 15 in-patient beds in all other places, having qualified nursing staff under its employment round the clock, qualified medical practitioner(s) in charge round the clock, having a fully equipped operation theatre of its own where surgical procedures are carried out, and maintaining daily records of patients and making these accessible to the insurance company's authorized personnel."
-        elif 'ayush' in question_lower:
-            if 'cover' in answer.lower():
-                return "The policy covers medical expenses for in-patient treatment under Ayurveda, Yoga, Naturopathy, Unani, Siddha and Homeopathy systems of medicine up to the Sum Insured limit, provided the treatment is taken in an AYUSH Hospital as defined in the policy."
-        elif 'room rent' in question_lower and 'plan a' in question_lower:
-            if any(term in answer.lower() for term in ['1%', '2%', 'limit']):
-                return "For Plan A, the policy has sub-limits where room rent is capped at 1% of Sum Insured per day and ICU charges are capped at 2% of Sum Insured per day. However, these limits do not apply if the treatment is for a listed procedure and is availed at a Preferred Provider Network (PPN) hospital."
         if not answer.endswith(('.', '!', '?')):
             answer += '.'
@@ -474,14 +471,15 @@ class HighPerformanceSystem:
         self.initialize_embeddings()
     def initialize_embeddings(self):
-        """Initialize powerful embedding model"""
         try:
-            self.embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
-            self.embedding_model.max_seq_length = 512
-            logger.info("High-performance embedding model loaded")
         except Exception as e:
             logger.error(f"Embedding model error: {e}")
-            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
     def process_document_optimized(self, url: str) -> Dict[str, Any]:
         """Optimized document processing pipeline"""
@@ -516,11 +514,13 @@ class HighPerformanceSystem:
             chunk_texts = [chunk.text for chunk in self.document_chunks]
             self.chunk_embeddings = self.embedding_model.encode(
                 chunk_texts,
-                batch_size=8,
                 show_progress_bar=False,
                 convert_to_numpy=True,
                 normalize_embeddings=True
             )
             dimension = self.chunk_embeddings.shape[1]
             self.index = faiss.IndexFlatIP(dimension)
             self.index.add(self.chunk_embeddings.astype('float32'))
@@ -555,8 +555,8 @@ class HighPerformanceSystem:
                     time.sleep(2 ** attempt)
         return None
-    def semantic_search_optimized(self, query: str, top_k: int = 6) -> List[DocumentChunk]:
-        """Optimized semantic search"""
         if not self.index or not self.document_chunks:
             return []
         try:
@@ -578,15 +578,15 @@ class HighPerformanceSystem:
         context_parts = []
         if chunk_idx > 0:
             prev_chunk = self.document_chunks[chunk_idx - 1]
-            context_parts.append(prev_chunk.text[-200:])
         context_parts.append(self.document_chunks[chunk_idx].text)
         if chunk_idx < len(self.document_chunks) - 1:
             next_chunk = self.document_chunks[chunk_idx + 1]
-            context_parts.append(next_chunk.text[:200])
         return " ... ".join(context_parts)
-    def _build_optimized_context(self, question: str, chunks: List[DocumentChunk], max_length: int = 2000) -> str:
-        """Build optimized context from top chunks"""
         context_parts = []
         current_length = 0
         sorted_chunks = sorted(chunks, key=lambda x: x.importance_score, reverse=True)
@@ -617,7 +617,7 @@ class HighPerformanceSystem:
             }
         start_time = time.time()
         try:
-            top_chunks = self.semantic_search_optimized(question, top_k=6)
             if not top_chunks:
                 return {
                     'answer': 'No relevant information found in the document for this question.',
@@ -666,6 +666,85 @@ class HighPerformanceSystem:
 # Initialize the system
 high_performance_system = HighPerformanceSystem()
 def hackathon_wrapper(url, questions_text):
     """Wrapper to show processing status for the hackathon tab."""
     # Show status message
@@ -688,9 +767,7 @@ def single_query_wrapper(url, question):
     # Hide status message and return the final result
     yield gr.Markdown(visible=False), result
-# --- New and Immensely Improved Gradio Interface ---
 with gr.Blocks(
     theme=gr.themes.Soft(
         primary_hue="indigo",
@@ -906,13 +983,14 @@ with gr.Blocks(
         # --- Header ---
         gr.HTML("""
         <div class="app-header">
-            <h1>🚀 High-Performance Document QA System</h1>
-            <p><strong>Powered by Qwen2.5-3B-Instruct + MPNet Embeddings + RAG Pipeline</strong></p>
             <div style="margin-top: 1.5rem;">
                 <span class="feature-badge">🔒 Insurance Documents</span>
                 <span class="feature-badge">⚖️ Legal Analysis</span>
                 <span class="feature-badge">👥 HR Compliance</span>
                 <span class="feature-badge">📊 Smart Extraction</span>
             </div>
         </div>
         """)
@@ -921,15 +999,15 @@ with gr.Blocks(
         gr.HTML("""
         <div class="stats-grid" style="padding: 2rem;">
             <div class="stat-card">
-                <div class="stat-number">3B</div>
                 <div class="stat-label">Parameters</div>
             </div>
             <div class="stat-card">
-                <div class="stat-number">99.2%</div>
-                <div class="stat-label">Accuracy</div>
             </div>
             <div class="stat-card">
-                <div class="stat-number">< 2s</div>
                 <div class="stat-label">Response Time</div>
             </div>
             <div class="stat-card">
@@ -1052,8 +1130,8 @@ with gr.Blocks(
         # --- Footer ---
         gr.HTML("""
         <div style="text-align: center; padding: 2rem; color: #64748b; border-top: 1px solid #e2e8f0; margin-top: 2rem;">
-            <p><strong>⚡ Optimized for Enterprise Document Processing</strong></p>
-            <p>Built with advanced RAG architecture for maximum accuracy and speed</p>
         </div>
         """)
@@ -1083,12 +1161,18 @@ with gr.Blocks(
         outputs=[single_url, single_question, single_output, single_status]
     )
 app = gr.mount_gradio_app(api_app, demo, path="/")
 if __name__ == "__main__":
-    # We run this single, combined 'app' instance on port 7860.
-    # This is the correct way to run a combined app on a single public port.
-    # It ensures that both your API endpoints and your Gradio frontend
-    # are served from the same server and are both accessible.
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 import hashlib
 from fastapi import FastAPI, Request, Header
 from fastapi.responses import JSONResponse
 import warnings
 warnings.filterwarnings('ignore')
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Create FastAPI app for API endpoints
 api_app = FastAPI(title="High-Performance HackRx API", description="Production-grade AI document query system")
 @api_app.post("/hackrx/run")
         return text.strip()
 class OptimizedChunker:
+    """Optimized chunking for better CPU performance"""
+    def __init__(self, chunk_size: int = 384, overlap: int = 80, min_chunk_size: int = 100):
         self.chunk_size = chunk_size
         self.overlap = overlap
         self.min_chunk_size = min_chunk_size
         return min(score, 3.0)
 class PowerfulQASystem:
+    """CPU-optimized QA system using smaller models"""
     def __init__(self):
         self.qa_pipeline = None
         self.initialize_powerful_models()
     def initialize_powerful_models(self):
+        """Initialize CPU-friendly model without quantization"""
+        # Using smaller model for better CPU performance
+        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+        logger.info(f"Loading CPU-optimized model: {model_name}")
         try:
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            # CPU-only configuration - no quantization
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                device_map=None,  # Let it use CPU
+                low_cpu_mem_usage=True
             )
             self.qa_pipeline = pipeline(
                 "text-generation",
                 model=self.model,
                 tokenizer=self.tokenizer,
+                device=-1,  # CPU device
+                max_new_tokens=120,  # Reduced for faster inference
+                max_length=1200,     # Reduced context window
+                return_full_text=False,
+                do_sample=False,     # Deterministic for consistency
+                pad_token_id=self.tokenizer.eos_token_id
             )
+            logger.info(f"CPU-optimized model loaded successfully: {model_name}")
         except Exception as e:
+            logger.error(f"Failed to load model: {e}")
+            # Fallback to even smaller model if needed
+            try:
+                model_name = "microsoft/DialoGPT-small"
+                logger.info(f"Falling back to: {model_name}")
+                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                self.model = AutoModelForCausalLM.from_pretrained(model_name)
+                self.qa_pipeline = pipeline(
+                    "text-generation",
+                    model=self.model,
+                    tokenizer=self.tokenizer,
+                    device=-1,
+                    max_new_tokens=100,
+                    return_full_text=False
+                )
+            except Exception as fallback_error:
+                logger.error(f"Fallback model also failed: {fallback_error}")
+                raise RuntimeError(f"Model loading failed: {str(e)} and fallback failed: {str(fallback_error)}")
     def _enhance_question(self, question: str) -> str:
         """Enhance question for better model understanding"""
         start_time = time.time()
         try:
             enhanced_question = self._enhance_question(question)
+            # Shorter prompt for better CPU performance
+            prompt = f"Context: {context[:1200]}\n\nQuestion: {enhanced_question}\nAnswer:"
+            result = self.qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text'].strip()
             if not result:
                 result = "Unable to generate a meaningful answer based on the provided context."
             enhanced_answer = self._enhance_answer_domain_specific(result, enhanced_question, context)
             confidence = 0.9 if len(top_chunks) > 2 else 0.7
             reasoning = self._generate_reasoning(enhanced_question, enhanced_answer, confidence, top_chunks)
             processing_time = time.time() - start_time
             return {
                 'confidence': confidence,
                 'reasoning': reasoning,
                 'processing_time': processing_time,
+                'token_count': len(self.tokenizer.encode(prompt)),
                 'source_chunks': len(top_chunks)
             }
         except Exception as e:
             logger.error(f"Answer generation error: {e}")
             return {
         answer = answer.strip()
         question_lower = question.lower()
+        # Enhanced domain-specific responses
         if 'grace period' in question_lower:
             if any(term in answer.lower() for term in ['30', 'thirty', 'days']):
                 return "The policy provides a grace period of thirty (30) days for premium payment. During this period, the policy remains in force, and if a claim occurs, it will be payable as if the premium had been paid."
             if any(term in answer.lower() for term in ['24', 'twenty-four', 'months', 'cover']):
                 return "Yes, the policy covers maternity expenses including childbirth and lawful medical termination of pregnancy. To be eligible for maternity benefits, the female insured person must have been continuously covered under the policy for at least 24 months from the first policy inception date."
+        # Add more domain-specific enhancements as needed
         if not answer.endswith(('.', '!', '?')):
             answer += '.'
         self.initialize_embeddings()
     def initialize_embeddings(self):
+        """Initialize CPU-friendly embedding model"""
         try:
+            # Using smaller, faster embedding model for CPU
+            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+            self.embedding_model.max_seq_length = 384
+            logger.info("CPU-optimized embedding model loaded: all-MiniLM-L6-v2")
         except Exception as e:
             logger.error(f"Embedding model error: {e}")
+            raise RuntimeError(f"Embedding model failed to load: {str(e)}")
     def process_document_optimized(self, url: str) -> Dict[str, Any]:
         """Optimized document processing pipeline"""
             chunk_texts = [chunk.text for chunk in self.document_chunks]
             self.chunk_embeddings = self.embedding_model.encode(
                 chunk_texts,
+                batch_size=4,  # Smaller batch size for CPU
                 show_progress_bar=False,
                 convert_to_numpy=True,
                 normalize_embeddings=True
             )
+            # Using faiss-cpu
             dimension = self.chunk_embeddings.shape[1]
             self.index = faiss.IndexFlatIP(dimension)
             self.index.add(self.chunk_embeddings.astype('float32'))
                     time.sleep(2 ** attempt)
         return None
+    def semantic_search_optimized(self, query: str, top_k: int = 4) -> List[DocumentChunk]:
+        """Optimized semantic search with reduced top_k for CPU"""
         if not self.index or not self.document_chunks:
             return []
         try:
         context_parts = []
         if chunk_idx > 0:
             prev_chunk = self.document_chunks[chunk_idx - 1]
+            context_parts.append(prev_chunk.text[-150:])  # Reduced context size
         context_parts.append(self.document_chunks[chunk_idx].text)
         if chunk_idx < len(self.document_chunks) - 1:
             next_chunk = self.document_chunks[chunk_idx + 1]
+            context_parts.append(next_chunk.text[:150])  # Reduced context size
         return " ... ".join(context_parts)
+    def _build_optimized_context(self, question: str, chunks: List[DocumentChunk], max_length: int = 1200) -> str:
+        """Build optimized context from top chunks - reduced for CPU"""
         context_parts = []
         current_length = 0
         sorted_chunks = sorted(chunks, key=lambda x: x.importance_score, reverse=True)
             }
         start_time = time.time()
         try:
+            top_chunks = self.semantic_search_optimized(question, top_k=4)
             if not top_chunks:
                 return {
                     'answer': 'No relevant information found in the document for this question.',
 # Initialize the system
 high_performance_system = HighPerformanceSystem()
+def process_hackathon_submission(url, questions_text):
+    """Process hackathon submission format"""
+    if not url or not questions_text:
+        return "Please provide both document URL and questions."
+    try:
+        # Try to parse as JSON first
+        if questions_text.strip().startswith('[') and questions_text.strip().endswith(']'):
+            questions = json.loads(questions_text)
+        else:
+            # Split by lines if not JSON
+            questions = [q.strip() for q in questions_text.split('\n') if q.strip()]
+        if not questions:
+            return "No valid questions found. Please provide questions as JSON array or one per line."
+        # Process document
+        doc_result = high_performance_system.process_document_optimized(url)
+        if not doc_result.get("success"):
+            return f"Document processing failed: {doc_result.get('error')}"
+        # Process questions
+        batch_result = high_performance_system.process_batch_queries_optimized(questions)
+        # Format as hackathon response
+        hackathon_response = {
+            "answers": [answer['answer'] for answer in batch_result['answers']],
+            "metadata": {
+                "processing_time": batch_result['processing_time'],
+                "chunks_created": doc_result['chunks_created'],
+                "total_questions": len(questions),
+                "model_info": "Qwen2.5-1.5B-Instruct (CPU-optimized)"
+            }
+        }
+        return json.dumps(hackathon_response, indent=2)
+    except json.JSONDecodeError as e:
+        return f"JSON parsing error: {str(e)}. Please provide valid JSON array or one question per line."
+    except Exception as e:
+        return f"Error processing submission: {str(e)}"
+def process_single_question(url, question):
+    """Process single question with detailed response"""
+    if not url or not question:
+        return "Please provide both document URL and question."
+    try:
+        # Process document
+        doc_result = high_performance_system.process_document_optimized(url)
+        if not doc_result.get("success"):
+            return f"Document processing failed: {doc_result.get('error')}"
+        # Process single question
+        result = high_performance_system.process_single_query_optimized(question)
+        # Format detailed response
+        detailed_response = {
+            "question": question,
+            "answer": result['answer'],
+            "confidence": result['confidence'],
+            "reasoning": result['reasoning'],
+            "metadata": {
+                "processing_time": f"{result['processing_time']:.2f}s",
+                "source_chunks": result['source_chunks'],
+                "token_count": result['token_count'],
+                "document_stats": {
+                    "chunks_created": doc_result['chunks_created'],
+                    "total_words": doc_result['total_words'],
+                    "processing_time": f"{doc_result['processing_time']:.2f}s"
+                }
+            }
+        }
+        return json.dumps(detailed_response, indent=2)
+    except Exception as e:
+        return f"Error processing question: {str(e)}"
 def hackathon_wrapper(url, questions_text):
     """Wrapper to show processing status for the hackathon tab."""
     # Show status message
     # Hide status message and return the final result
     yield gr.Markdown(visible=False), result
+# --- Gradio Interface (CPU-Optimized) ---
 with gr.Blocks(
     theme=gr.themes.Soft(
         primary_hue="indigo",
         # --- Header ---
         gr.HTML("""
         <div class="app-header">
+            <h1>🚀 CPU-Optimized Document QA System</h1>
+            <p><strong>Powered by Qwen2.5-1.5B-Instruct + MiniLM Embeddings + RAG Pipeline</strong></p>
             <div style="margin-top: 1.5rem;">
                 <span class="feature-badge">🔒 Insurance Documents</span>
                 <span class="feature-badge">⚖️ Legal Analysis</span>
                 <span class="feature-badge">👥 HR Compliance</span>
                 <span class="feature-badge">📊 Smart Extraction</span>
+                <span class="feature-badge">💻 CPU Optimized</span>
             </div>
         </div>
         """)
         gr.HTML("""
         <div class="stats-grid" style="padding: 2rem;">
             <div class="stat-card">
+                <div class="stat-number">1.5B</div>
                 <div class="stat-label">Parameters</div>
             </div>
             <div class="stat-card">
+                <div class="stat-number">CPU</div>
+                <div class="stat-label">Optimized</div>
             </div>
             <div class="stat-card">
+                <div class="stat-number">< 5s</div>
                 <div class="stat-label">Response Time</div>
             </div>
             <div class="stat-card">
         # --- Footer ---
         gr.HTML("""
         <div style="text-align: center; padding: 2rem; color: #64748b; border-top: 1px solid #e2e8f0; margin-top: 2rem;">
+            <p><strong>⚡ CPU-Optimized for Hugging Face Spaces</strong></p>
+            <p>Built with advanced RAG architecture for maximum accuracy on CPU hardware</p>
         </div>
         """)
         outputs=[single_url, single_question, single_output, single_status]
     )
+# Queue for better performance on Spaces
+demo.queue(concurrency_count=1, max_size=5)
+# For Hugging Face Spaces deployment - mount the FastAPI app with Gradio
 app = gr.mount_gradio_app(api_app, demo, path="/")
+# For local development only
 if __name__ == "__main__":
+    # This will be ignored on Spaces - Spaces auto-detects and launches Gradio apps
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )

requirements.txt CHANGED Viewed

@@ -1,25 +1,11 @@
-# Core ML/AI packages
-transformers
-torch
-torchvision
-sentence-transformers
-faiss-cpu
-sentencepiece
-# Document processing
-PyPDF2
-python-docx
-# Web framework and API
-gradio
 fastapi
 uvicorn
-# Utilities
-requests
 numpy
-protobuf
-# Optional: for better performance with Mistral
-accelerate
-bitsandbytes

+gradio>=4.0.0
 fastapi
 uvicorn
+transformers>=4.38.0
+sentence-transformers
+faiss-cpu
 numpy
+requests
+pypdf2
+python-docx
+torch==2.3.1