Spaces:

gmustafa413
/

ChatBot

Build error

App Files Files Community

gmustafa413 committed on Apr 2, 2025

Commit

43c6974

verified ·

1 Parent(s): 159cb34

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -202

app.py CHANGED Viewed

@@ -1,235 +1,318 @@
-!pip install langdetect faiss-cpu transformers gradio groq sentence-transformers pypdf2 python-pptx pandas docx2txt
-import gradio as gr
-import fitz  # PyMuPDF
 import numpy as np
 import requests
-import faiss
-import re
 import json
-import pandas as pd
-from docx import Document
-from pptx import Presentation
 from sentence_transformers import SentenceTransformer
 from concurrent.futures import ThreadPoolExecutor
 # Configuration
 GROQ_API_KEY = "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"  # Replace with your actual key
-EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Proper embedding model
 CHUNK_SIZE = 512
 MAX_TOKENS = 4096
-WORKERS = 8
-# Initialize the embedding model
-embedding_model = SentenceTransformer(EMBEDDING_MODEL)
-class DocumentProcessor:
     def __init__(self):
-        self.index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
         self.chunks = []
-        self.processor_pool = ThreadPoolExecutor(max_workers=WORKERS)
-    def extract_text_from_pptx(self, file_path):
         try:
-            prs = Presentation(file_path)
-            return " ".join([shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text")])
-        except Exception as e:
-            print(f"PPTX Error: {str(e)}")
-            return ""
-    def extract_text_from_xls_csv(self, file_path):
-        try:
-            if file_path.endswith(('.xls', '.xlsx')):
-                df = pd.read_excel(file_path)
-            else:
-                df = pd.read_csv(file_path)
-            return " ".join(df.astype(str).values.flatten())
         except Exception as e:
-            print(f"Spreadsheet Error: {str(e)}")
-            return ""
-    def extract_text_from_pdf(self, file_path):
         try:
-            doc = fitz.open(file_path)
-            return " ".join(page.get_text("text", flags=fitz.TEXT_PRESERVE_WHITESPACE) for page in doc)
         except Exception as e:
-            print(f"PDF Error: {str(e)}")
-            return ""
-    def process_file(self, file):
-        try:
-            file_path = file.name
-            print(f"Processing: {file_path}")
-            if file_path.endswith('.pdf'):
-                text = self.extract_text_from_pdf(file_path)
-            elif file_path.endswith('.docx'):
-                text = " ".join(p.text for p in Document(file_path).paragraphs)
-            elif file_path.endswith('.txt'):
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    text = f.read()
-            elif file_path.endswith('.pptx'):
-                text = self.extract_text_from_pptx(file_path)
-            elif file_path.endswith(('.xls', '.xlsx', '.csv')):
-                text = self.extract_text_from_xls_csv(file_path)
-            else:
-                return ""
-            clean_text = re.sub(r'\s+', ' ', text).strip()
-            print(f"Extracted {len(clean_text)} characters from {file_path}")
-            return clean_text
-        except Exception as e:
-            print(f"Processing Error: {str(e)}")
-            return ""
-    def semantic_chunking(self, text):
-        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-        chunks = []
-        current_chunk = ""
-        for sentence in sentences:
-            if len(current_chunk) + len(sentence) < CHUNK_SIZE:
-                current_chunk += " " + sentence
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk.strip())
-                current_chunk = sentence
-        if current_chunk:
-            chunks.append(current_chunk.strip())
-        return chunks[:1000]  # Limit to 1000 chunks per document
-    def process_documents(self, files):
-        self.chunks = []
-        if not files:
-            return "No files uploaded!"
-        print("\n" + "="*40 + " PROCESSING DOCUMENTS " + "="*40)
-        texts = list(self.processor_pool.map(self.process_file, files))
-        with ThreadPoolExecutor(max_workers=WORKERS) as executor:
-            chunk_lists = list(executor.map(self.semantic_chunking, texts))
-        all_chunks = [chunk for chunk_list in chunk_lists for chunk in chunk_list]
-        print(f"Total chunks generated: {len(all_chunks)}")
-        if not all_chunks:
-            return "Error: No chunks generated from documents"
-        try:
-            embeddings = embedding_model.encode(
-                all_chunks,
-                batch_size=32,
-                convert_to_tensor=True,
-                show_progress_bar=False
-            ).cpu().numpy().astype('float32')
-            self.index.reset()
-            self.index.add(embeddings)
-            self.chunks = all_chunks
-            return f"Processed {len(all_chunks)} chunks from {len(files)} files"
-        except Exception as e:
-            print(f"Embedding Error: {str(e)}")
-            return f"Error: {str(e)}"
-    def query(self, question):
-        if not self.chunks:
-            return "Please process documents first", False
-        try:
-            print("\n" + "="*40 + " QUERY PROCESSING " + "="*40)
-            print(f"Question: {question}")
-            question_embedding = embedding_model.encode([question], convert_to_tensor=True).cpu().numpy().astype('float32')
-            _, indices = self.index.search(question_embedding, 3)
-            print(f"Top indices: {indices}")
-            context = "\n".join([self.chunks[i] for i in indices[0] if i < len(self.chunks)])
-            print(f"Context length: {len(context)} characters")
-            headers = {
-                "Authorization": f"Bearer {GROQ_API_KEY}",
-                "Content-Type": "application/json"
-            }
-            payload = {
-                "messages": [{
-                    "role": "user",
-                    "content": f"Answer concisely based on the context: {question}\nContext: {context}"
-                }],
-                "model": "mixtral-8x7b-32768",
-                "temperature": 0.3,
-                "max_tokens": MAX_TOKENS,
-                "stream": False  # Changed to False for simpler handling
-            }
-            response = requests.post(
-                "https://api.groq.com/openai/v1/chat/completions",
-                headers=headers,
-                json=payload,
-                timeout=20
-            )
-            print(f"API Status Code: {response.status_code}")
-            if response.status_code != 200:
-                return f"API Error: {response.text}", False
-            data = response.json()
-            final_answer = data.get("choices", [{}])[0].get("message", {}).get("content", "")
-            print(f"Final Answer: {final_answer}")
-            return final_answer, True
-        except Exception as e:
-            print(f"Query Error: {str(e)}")
-            return f"Error: {str(e)}", False
-processor = DocumentProcessor()
-def ask_question(question, chat_history):
-    if not question.strip():
-        return chat_history + [("", "Please enter a valid question")]
-    answer, success = processor.query(question)
-    return chat_history + [(question, answer)]
-with gr.Blocks(title="Document ChatBot") as app:
-    gr.Markdown("## 🚀 Multi-Format Document ChatBot")
-    with gr.Row():
-        files = gr.File(
-            file_count="multiple",
-            file_types=[".pdf", ".docx", ".txt", ".pptx", ".xls", ".xlsx", ".csv"],
-            label="Upload Documents"
         )
-        process_btn = gr.Button("Process Documents", variant="primary")
-    status = gr.Textbox(label="Processing Status", interactive=False)
-    chatbot = gr.Chatbot(height=500, label="Chat History")
     with gr.Row():
-        question = gr.Textbox(
-            label="Your Query",
-            placeholder="Enter your question about the documents...",
-            max_lines=3
-        )
-        ask_btn = gr.Button("Ask", variant="primary")
-    clear_btn = gr.Button("Clear Chat")
-    process_btn.click(
-        fn=processor.process_documents,
-        inputs=files,
-        outputs=status
     )
     ask_btn.click(
-        fn=ask_question,
         inputs=[question, chatbot],
-        outputs=chatbot
     ).then(lambda: "", None, question)
     clear_btn.click(
         fn=lambda: [],
         inputs=None,
-        outputs=chatbot
     )
-app.launch()

+import faiss
 import numpy as np
+import gradio as gr
 import requests
 import json
+import re
+import torch
+from transformers import AutoTokenizer
+from langdetect import detect
 from sentence_transformers import SentenceTransformer
 from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
 # Configuration
 GROQ_API_KEY = "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"  # Replace with your actual key
+MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+DATASET_URL = "https://huggingface.co/datasets/midrees2806/7K_Dataset/resolve/main/University_of_Education_Lahore_FAQ.json"
 CHUNK_SIZE = 512
 MAX_TOKENS = 4096
+WORKERS = 4
+EMBEDDING_BATCH_SIZE = 32
+# Load the embedding model
+model = SentenceTransformer(MODEL_NAME)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
class UniversityKnowledgeBase:
    """Searchable store of University FAQ chunks backed by a FAISS L2 index.

    ``load_dataset`` downloads the JSON dataset at ``DATASET_URL``, turns each
    entry into one or more text chunks, embeds them with the module-level
    ``model`` and indexes the vectors.  ``find_relevant_context`` returns the
    chunks nearest to a query.

    Attributes:
        index: ``faiss.IndexFlatL2`` holding one vector per chunk.
        chunks: Chunk texts; position ``i`` corresponds to vector ``i``.
        loaded: True once a dataset load has completed successfully.
        total_chunks: ``len(chunks)`` after the last successful load.
    """

    # Whitespace that follows '.' or '?' and is not preceded by an
    # abbreviation-like pattern; used to cut text at sentence boundaries.
    _SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    def __init__(self):
        self.index = self._new_index()
        self.chunks = []
        self.loaded = False
        self.total_chunks = 0

    @staticmethod
    def _new_index():
        """Return an empty L2 index sized for the embedding model's vectors."""
        return faiss.IndexFlatL2(model.get_sentence_embedding_dimension())

    @staticmethod
    def _split_text(text, chunk_size=None):
        """Split ``text`` into sentence-aligned chunks.

        Sentences are accumulated while the running length stays below
        ``chunk_size`` (defaults to ``CHUNK_SIZE``).  A single sentence longer
        than the limit is kept whole as its own chunk.  Empty text yields no
        chunks.
        """
        limit = CHUNK_SIZE if chunk_size is None else chunk_size
        if not text:
            return []
        if len(text) <= limit:
            return [text]

        pieces = []
        current = ""
        for sentence in UniversityKnowledgeBase._SENTENCE_BOUNDARY.split(text):
            if len(current) + len(sentence) < limit:
                current += " " + sentence
                continue
            if current:
                pieces.append(current.strip())
            current = sentence
        if current:
            pieces.append(current.strip())
        return pieces

    @staticmethod
    def _item_to_chunks(item):
        """Convert one dataset entry into a list of chunk strings.

        A dict with string ``question`` and ``answer`` values becomes a single
        Q&A chunk; otherwise a dict with a string ``text`` value is split with
        ``_split_text``.  Anything else is ignored (empty list) so that one
        malformed entry cannot abort the whole load.
        """
        if not isinstance(item, dict):
            return []
        question = item.get("question")
        answer = item.get("answer")
        if isinstance(question, str) and isinstance(answer, str):
            return [f"QUESTION: {question.strip()}\nANSWER: {answer.strip()}\n"]
        text = item.get("text")
        if isinstance(text, str):
            return UniversityKnowledgeBase._split_text(text.strip())
        return []

    def load_dataset(self):
        """Download, chunk, embed and index the University dataset.

        The new chunks and index are built in local variables and only
        assigned to the instance once everything has succeeded.  This keeps
        the index and ``chunks`` aligned on reload (vectors are not appended
        to the previous index) and leaves the previous state usable if a
        reload fails.

        Returns:
            A human-readable status string: a success message with the chunk
            count, or an error message describing what went wrong.
        """
        try:
            print("\n" + "="*50)
            print("Loading University of Education, Lahore dataset...")
            print("="*50 + "\n")

            response = requests.get(DATASET_URL, timeout=30)
            if response.status_code != 200:
                raise Exception(f"Failed to fetch dataset. HTTP Status: {response.status_code}")

            try:
                data = response.json()
            except ValueError as err:
                # json.JSONDecodeError is a ValueError subclass; catching the
                # base class also covers other JSON backends.
                raise Exception("Invalid JSON format in dataset") from err

            if not isinstance(data, list):
                raise Exception("Dataset format is invalid. Expected a list of Q&A pairs.")

            chunks = []
            for item in tqdm(data, desc="Processing dataset"):
                chunks.extend(self._item_to_chunks(item))

            if not chunks:
                raise Exception("No valid content found in the dataset")
            print(f"\nSuccessfully processed {len(chunks)} knowledge chunks from dataset")

            print("\nGenerating embeddings...")
            embeddings = []
            # tqdm takes its total from the range, so the bar is exact even
            # when the chunk count is a multiple of the batch size.
            for start in tqdm(range(0, len(chunks), EMBEDDING_BATCH_SIZE),
                              desc="Creating embeddings"):
                batch = chunks[start:start + EMBEDDING_BATCH_SIZE]
                batch_embeddings = model.encode(
                    batch,
                    convert_to_tensor=True,
                    show_progress_bar=False
                ).cpu().numpy().astype('float32')
                embeddings.append(batch_embeddings)

            new_index = self._new_index()
            new_index.add(np.concatenate(embeddings))

            # Publish the new state only after every step above succeeded.
            self.index = new_index
            self.chunks = chunks
            self.total_chunks = len(chunks)
            self.loaded = True
            return f"✅ Successfully loaded {self.total_chunks} knowledge chunks from University dataset"
        except Exception as e:
            import traceback
            traceback.print_exc()
            return f"❌ Error loading dataset: {str(e)}"

    def find_relevant_context(self, query, k=5):
        """Return up to ``k`` chunks nearest to ``query``, joined by separators.

        Args:
            query: The user's question text.
            k: Maximum number of chunks to return.

        Returns:
            The selected chunks joined with ``"\\n\\n---\\n\\n"``, or ``None``
            when nothing is loaded, ``k`` is not positive, no valid chunk is
            found, or retrieval raises.
        """
        if not self.loaded or not self.chunks or k <= 0:
            return None

        try:
            query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')

            # Over-fetch, but never ask for more neighbours than exist.
            search_size = min(k * 2, len(self.chunks))
            _, indices = self.index.search(query_embedding, search_size)

            selected_chunks = []
            seen = set()
            for raw_idx in indices[0]:
                idx = int(raw_idx)
                # Skip duplicates and out-of-range labels before counting
                # towards k, so invalid entries do not crowd out valid ones.
                if idx in seen or not 0 <= idx < len(self.chunks):
                    continue
                seen.add(idx)
                selected_chunks.append(self.chunks[idx])
                if len(selected_chunks) == k:
                    break

            return "\n\n---\n\n".join(selected_chunks) if selected_chunks else None
        except Exception as e:
            print(f"Context retrieval error: {str(e)}")
            return None
+# Module-level knowledge base shared by all chat requests; constructing it only creates an empty FAISS index (the dataset is loaded later via load_dataset)
+knowledge_base = UniversityKnowledgeBase()
def detect_language(text):
    """Classify ``text`` as "Roman Urdu", "Urdu" or "English".

    Roman Urdu is recognised first by looking for common Roman Urdu words.
    Keywords are compared against whole words, not substrings, so English
    words such as "who", "make" or "apply" no longer trigger a match.
    Otherwise ``langdetect.detect`` decides: "ur" maps to "Urdu", "hi" maps to
    "Urdu" for non-ASCII text and "Roman Urdu" for ASCII text, and everything
    else maps to "English".

    Detection is best-effort: any failure (non-string input, text the
    detector cannot classify) falls back to "English".
    """
    # NOTE(review): "main" is also a common English word, so sentences such as
    # "main campus" are still classified as Roman Urdu — confirm whether this
    # keyword should stay.
    roman_urdu_keywords = {'hai', 'ho', 'hain', 'ka', 'ki', 'ke', 'main', 'tum', 'ap', 'kyun', 'kya'}
    try:
        text = text.lower().strip()

        # Whole-word comparison against the keyword set.
        words = set(re.findall(r"[a-z]+", text))
        if words & roman_urdu_keywords:
            return "Roman Urdu"

        lang = detect(text)
        if lang == "ur":
            return "Urdu"
        if lang == "hi":  # Hindi/Urdu handling
            return "Urdu" if not text.isascii() else "Roman Urdu"
        return "English"
    except Exception:
        # Deliberate best-effort fallback; unlike a bare ``except`` this does
        # not swallow KeyboardInterrupt or SystemExit.
        return "English"
def get_groq_response(context, user_query, language="English", model_name="mixtral-8x7b-32768"):
    """Ask the Groq chat-completions API to answer ``user_query`` from ``context``.

    Args:
        context: Retrieved knowledge-base text placed in the user message.
        user_query: The user's question.
        language: "Urdu", "Roman Urdu" or "English"; selects the system
            prompt.  Unknown values fall back to the English prompt.
        model_name: Groq model id to request.  Defaults to the id this file
            has always used, so existing callers are unaffected.

    Returns:
        The assistant message content (possibly an empty string), or ``None``
        when the HTTP status is not 200 or the request/response handling
        raises.
    """
    # NOTE(review): confirm the default model id is still served by Groq;
    # hosted model ids can be retired, which would make every call fail.
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    # Language-specific system prompts
    system_prompts = {
        "Urdu": """
        آپ یونیورسٹی آف ایجوکیشن، لاہور کا سرکاری چیٹ بوٹ ہیں۔ درج ذیل معلومات کی بنیاد پر درست جواب دیں۔
        اگر جواب دستیاب نہ ہو تو کہیں:
        "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔"
        """,
        "Roman Urdu": """
        Aap University of Education, Lahore ka chatbot hain. Diye gaye context ke hisab se jawab dein.
        Agar jawab nahin mila to kehain:
        "Maazrat, yeh maloomat mojood nahin. University ki website check karein."
        """,
        "English": """
        You are the official chatbot of University of Education, Lahore.
        Answer STRICTLY based on the provided context. If the answer isn't available, say:
        "I'm sorry, this information isn't available. Please check the university website."
        """
    }

    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system_prompts.get(language, system_prompts["English"])},
            {"role": "user", "content": f"University Context:\n{context}\n\nQuestion: {user_query}"}
        ],
        "temperature": 0.1,  # Low temperature for factual accuracy
        "max_tokens": MAX_TOKENS,
        "top_p": 0.9
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=30
        )

        if response.status_code != 200:
            print(f"API Error {response.status_code}: {response.text[:200]}")
            return None

        # A missing or empty "choices" list yields an empty answer rather
        # than an IndexError reported as a request failure.
        choices = response.json().get("choices") or [{}]
        return choices[0].get("message", {}).get("content", "")
    except Exception as e:
        print(f"API Request Failed: {str(e)}")
        return None
def chatbot_response(user_input, chat_history):
    """Answer one user message and return the extended chat history.

    The message language is detected, relevant context is retrieved from the
    knowledge base, and the Groq API is asked to answer from that context.
    Localised fallback messages are used when no context is found or the API
    call yields no answer.

    Args:
        user_input: The text typed by the user; ``None`` is treated as empty.
        chat_history: List of ``(user, bot)`` tuples; ``None`` is treated as
            an empty history.  The list passed in is not modified.

    Returns:
        A new list: the previous history plus one ``(user_input, reply)``
        tuple.
    """
    history = list(chat_history or [])
    user_input = user_input or ""

    if not user_input.strip():
        return history + [(user_input, "Please enter a valid question.")]

    # Detect language
    language = detect_language(user_input)

    # Retrieve relevant context (more chunks for better accuracy)
    context = knowledge_base.find_relevant_context(user_input, k=5)

    # Handle no context found
    if not context:
        error_messages = {
            "Urdu": "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔",
            "Roman Urdu": "Maazrat, yeh maloomat mojood nahin. University ki website check karein.",
            "English": "I'm sorry, this information isn't available. Please check the university website."
        }
        return history + [(user_input, error_messages.get(language, error_messages["English"]))]

    # Generate response
    response = get_groq_response(context, user_input, language)

    # Fallback if API fails (None) or returns an empty answer
    if not response:
        fallback_messages = {
            "Urdu": "معذرت، نظام میں عارضی خرابی ہے۔ بعد میں کوشش کریں۔",
            "Roman Urdu": "Maazrat, system mein masla hai. Baad mein koshish karein.",
            "English": "Sorry, there's a temporary system issue. Please try again later."
        }
        response = fallback_messages.get(language, fallback_messages["English"])

    return history + [(user_input, response)]
+# Gradio interface: a status column (scale 1) and a wider chat column (scale 2) inside one Blocks app
+with gr.Blocks(title="University of Education ChatBot", theme=gr.themes.Soft()) as app:
+    gr.Markdown("""
+    <div style='text-align: center;'>
+        <h1>University of Education, Lahore</h1>
+        <h2>Official Information ChatBot</h2>
+        <p>Ask any question about the university in English, Urdu, or Roman Urdu</p>
+    </div>
+    """)
+    # Load the dataset once while the UI is being built; the returned status string seeds the status box below
+    load_status = knowledge_base.load_dataset()
     with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Knowledge Base Status")
+            status = gr.Textbox(
+                label="Dataset Status",
+                value=load_status,
+                interactive=False,
+                lines=2
+            )
+            reload_btn = gr.Button("🔄 Reload Knowledge Base", variant="secondary")
+            gr.Markdown("""
+            **Note:** This chatbot answers strictly based on the official University of Education, Lahore dataset.
+            """)
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(
+                height=500,
+                label="Conversation History",
+                bubble_full_width=False
+            )
+            question = gr.Textbox(
+                label="Your Question",
+                placeholder="Type your question about the university...",
+                lines=2,
+                max_lines=5
+            )
+            with gr.Row():
+                ask_btn = gr.Button("Ask Question", variant="primary")
+                clear_btn = gr.Button("Clear Conversation", variant="secondary")
+    # Event handlers: reload the dataset into the status box, answer a question (then clear the input box), and clear the chat
+    reload_btn.click(
+        fn=lambda: knowledge_base.load_dataset(),
+        inputs=None,
+        outputs=status,
+        queue=False
     )
     ask_btn.click(
+        fn=chatbot_response,
         inputs=[question, chatbot],
+        outputs=chatbot,
+        queue=True
     ).then(lambda: "", None, question)
     clear_btn.click(
         fn=lambda: [],
         inputs=None,
+        outputs=chatbot,
+        queue=False
     )
+# Launch only when run as a script; listens on all interfaces (0.0.0.0) on port 7860
+if __name__ == "__main__":
+    app.launch(server_name="0.0.0.0", server_port=7860)