Spaces:
Build error
Build error
| import faiss | |
| import numpy as np | |
| import gradio as gr | |
| import requests | |
| import json | |
| import re | |
| import torch | |
| from transformers import AutoTokenizer | |
| from langdetect import detect | |
| from sentence_transformers import SentenceTransformer | |
| from concurrent.futures import ThreadPoolExecutor | |
| from tqdm import tqdm | |
| # Configuration | |
| GROQ_API_KEY = "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" # Replace with your actual key | |
| MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" | |
| DATASET_URL = "https://huggingface.co/datasets/midrees2806/7K_Dataset/resolve/main/University_of_Education_Lahore_FAQ.json" | |
| CHUNK_SIZE = 512 | |
| MAX_TOKENS = 4096 | |
| WORKERS = 4 | |
| EMBEDDING_BATCH_SIZE = 32 | |
| # Load the embedding model | |
| model = SentenceTransformer(MODEL_NAME) | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) | |
| class UniversityKnowledgeBase: | |
| def __init__(self): | |
| self.index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension()) | |
| self.chunks = [] | |
| self.loaded = False | |
| self.total_chunks = 0 | |
| def load_dataset(self): | |
| """Loads and thoroughly processes the University dataset""" | |
| try: | |
| print("\n" + "="*50) | |
| print("Loading University of Education, Lahore dataset...") | |
| print("="*50 + "\n") | |
| # Fetch dataset with error handling | |
| response = requests.get(DATASET_URL, timeout=30) | |
| if response.status_code != 200: | |
| raise Exception(f"Failed to fetch dataset. HTTP Status: {response.status_code}") | |
| # Parse JSON content | |
| try: | |
| data = response.json() | |
| except json.JSONDecodeError: | |
| raise Exception("Invalid JSON format in dataset") | |
| if not isinstance(data, list): | |
| raise Exception("Dataset format is invalid. Expected a list of Q&A pairs.") | |
| # Process all content with progress tracking | |
| self.chunks = [] | |
| with tqdm(data, desc="Processing dataset") as progress_bar: | |
| for item in progress_bar: | |
| if isinstance(item, dict): | |
| if 'question' in item and 'answer' in item: | |
| # Create comprehensive Q&A chunks | |
| self.chunks.append(f"QUESTION: {item['question'].strip()}\nANSWER: {item['answer'].strip()}\n") | |
| elif 'text' in item: | |
| # Process text content with semantic chunking | |
| text = item['text'].strip() | |
| if len(text) > CHUNK_SIZE: | |
| sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text) | |
| current_chunk = "" | |
| for sentence in sentences: | |
| if len(current_chunk) + len(sentence) < CHUNK_SIZE: | |
| current_chunk += " " + sentence | |
| else: | |
| if current_chunk: | |
| self.chunks.append(current_chunk.strip()) | |
| current_chunk = sentence | |
| if current_chunk: | |
| self.chunks.append(current_chunk.strip()) | |
| else: | |
| self.chunks.append(text) | |
| self.total_chunks = len(self.chunks) | |
| if self.total_chunks == 0: | |
| raise Exception("No valid content found in the dataset") | |
| print(f"\nSuccessfully processed {self.total_chunks} knowledge chunks from dataset") | |
| # Generate embeddings in batches with progress tracking | |
| print("\nGenerating embeddings...") | |
| embeddings = [] | |
| for i in tqdm(range(0, self.total_chunks, EMBEDDING_BATCH_SIZE), | |
| desc="Creating embeddings", | |
| total=(self.total_chunks//EMBEDDING_BATCH_SIZE)+1): | |
| batch = self.chunks[i:i+EMBEDDING_BATCH_SIZE] | |
| batch_embeddings = model.encode( | |
| batch, | |
| convert_to_tensor=True, | |
| show_progress_bar=False | |
| ).cpu().numpy().astype('float32') | |
| embeddings.append(batch_embeddings) | |
| # Combine all embeddings and build FAISS index | |
| all_embeddings = np.concatenate(embeddings) | |
| self.index.add(all_embeddings) | |
| self.loaded = True | |
| return f"✅ Successfully loaded {self.total_chunks} knowledge chunks from University dataset" | |
| except Exception as e: | |
| import traceback | |
| traceback.print_exc() | |
| return f"❌ Error loading dataset: {str(e)}" | |
| def find_relevant_context(self, query, k=5): | |
| """Finds the most relevant context with enhanced retrieval""" | |
| if not self.loaded or not self.chunks: | |
| return None | |
| try: | |
| # Generate query embedding | |
| query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32') | |
| # Search with higher k initially for better context | |
| _, indices = self.index.search(query_embedding, k*2) | |
| # Get unique chunks (avoid duplicates) | |
| unique_indices = list(dict.fromkeys(indices[0])) | |
| # Select top-k most relevant unique chunks | |
| selected_chunks = [] | |
| for idx in unique_indices[:k]: | |
| if 0 <= idx < len(self.chunks): | |
| selected_chunks.append(self.chunks[idx]) | |
| return "\n\n---\n\n".join(selected_chunks) if selected_chunks else None | |
| except Exception as e: | |
| print(f"Context retrieval error: {str(e)}") | |
| return None | |
| # Initialize the knowledge base | |
| knowledge_base = UniversityKnowledgeBase() | |
| def detect_language(text): | |
| """Enhanced language detection with Urdu support""" | |
| try: | |
| text = text.lower().strip() | |
| # Roman Urdu detection | |
| roman_urdu_keywords = ['hai', 'ho', 'hain', 'ka', 'ki', 'ke', 'main', 'tum', 'ap', 'kyun', 'kya'] | |
| if any(keyword in text for keyword in roman_urdu_keywords): | |
| return "Roman Urdu" | |
| # Standard detection | |
| lang = detect(text) | |
| if lang == "ur": | |
| return "Urdu" | |
| elif lang == "hi": # Hindi/Urdu handling | |
| return "Urdu" if not text.isascii() else "Roman Urdu" | |
| return "English" | |
| except: | |
| return "English" | |
| def get_groq_response(context, user_query, language="English"): | |
| """Generates accurate responses strictly based on context""" | |
| headers = { | |
| "Authorization": f"Bearer {GROQ_API_KEY}", | |
| "Content-Type": "application/json" | |
| } | |
| # Language-specific system prompts | |
| system_prompts = { | |
| "Urdu": """ | |
| آپ یونیورسٹی آف ایجوکیشن، لاہور کا سرکاری چیٹ بوٹ ہیں۔ درج ذیل معلومات کی بنیاد پر درست جواب دیں۔ | |
| اگر جواب دستیاب نہ ہو تو کہیں: | |
| "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔" | |
| """, | |
| "Roman Urdu": """ | |
| Aap University of Education, Lahore ka chatbot hain. Diye gaye context ke hisab se jawab dein. | |
| Agar jawab nahin mila to kehain: | |
| "Maazrat, yeh maloomat mojood nahin. University ki website check karein." | |
| """, | |
| "English": """ | |
| You are the official chatbot of University of Education, Lahore. | |
| Answer STRICTLY based on the provided context. If the answer isn't available, say: | |
| "I'm sorry, this information isn't available. Please check the university website." | |
| """ | |
| } | |
| payload = { | |
| "model": "mixtral-8x7b-32768", | |
| "messages": [ | |
| {"role": "system", "content": system_prompts.get(language, system_prompts["English"])}, | |
| {"role": "user", "content": f"University Context:\n{context}\n\nQuestion: {user_query}"} | |
| ], | |
| "temperature": 0.1, # Low temperature for factual accuracy | |
| "max_tokens": MAX_TOKENS, | |
| "top_p": 0.9 | |
| } | |
| try: | |
| response = requests.post( | |
| "https://api.groq.com/openai/v1/chat/completions", | |
| headers=headers, | |
| json=payload, | |
| timeout=30 | |
| ) | |
| if response.status_code != 200: | |
| print(f"API Error {response.status_code}: {response.text[:200]}") | |
| return None | |
| return response.json().get("choices", [{}])[0].get("message", {}).get("content", "") | |
| except Exception as e: | |
| print(f"API Request Failed: {str(e)}") | |
| return None | |
| def chatbot_response(user_input, chat_history): | |
| """Handles user queries with comprehensive response generation""" | |
| if not user_input.strip(): | |
| return chat_history + [(user_input, "Please enter a valid question.")] | |
| # Detect language | |
| language = detect_language(user_input) | |
| # Retrieve relevant context (more chunks for better accuracy) | |
| context = knowledge_base.find_relevant_context(user_input, k=5) | |
| # Handle no context found | |
| if not context: | |
| error_messages = { | |
| "Urdu": "معذرت، یہ معلومات دستیاب نہیں۔ براہ کرم یونیورسٹی کی ویب سائٹ دیکھیں۔", | |
| "Roman Urdu": "Maazrat, yeh maloomat mojood nahin. University ki website check karein.", | |
| "English": "I'm sorry, this information isn't available. Please check the university website." | |
| } | |
| return chat_history + [(user_input, error_messages.get(language, error_messages["English"]))] | |
| # Generate response | |
| response = get_groq_response(context, user_input, language) | |
| # Fallback if API fails | |
| if not response: | |
| fallback_messages = { | |
| "Urdu": "معذرت، نظام میں عارضی خرابی ہے۔ بعد میں کوشش کریں۔", | |
| "Roman Urdu": "Maazrat, system mein masla hai. Baad mein koshish karein.", | |
| "English": "Sorry, there's a temporary system issue. Please try again later." | |
| } | |
| response = fallback_messages.get(language, fallback_messages["English"]) | |
| return chat_history + [(user_input, response)] | |
| # Gradio Interface | |
| with gr.Blocks(title="University of Education ChatBot", theme=gr.themes.Soft()) as app: | |
| gr.Markdown(""" | |
| <div style='text-align: center;'> | |
| <h1>University of Education, Lahore</h1> | |
| <h2>Official Information ChatBot</h2> | |
| <p>Ask any question about the university in English, Urdu, or Roman Urdu</p> | |
| </div> | |
| """) | |
| # Initialize dataset | |
| load_status = knowledge_base.load_dataset() | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr.Markdown("### Knowledge Base Status") | |
| status = gr.Textbox( | |
| label="Dataset Status", | |
| value=load_status, | |
| interactive=False, | |
| lines=2 | |
| ) | |
| reload_btn = gr.Button("🔄 Reload Knowledge Base", variant="secondary") | |
| gr.Markdown(""" | |
| **Note:** This chatbot answers strictly based on the official University of Education, Lahore dataset. | |
| """) | |
| with gr.Column(scale=2): | |
| chatbot = gr.Chatbot( | |
| height=500, | |
| label="Conversation History", | |
| bubble_full_width=False | |
| ) | |
| question = gr.Textbox( | |
| label="Your Question", | |
| placeholder="Type your question about the university...", | |
| lines=2, | |
| max_lines=5 | |
| ) | |
| with gr.Row(): | |
| ask_btn = gr.Button("Ask Question", variant="primary") | |
| clear_btn = gr.Button("Clear Conversation", variant="secondary") | |
| # Event handlers | |
| reload_btn.click( | |
| fn=lambda: knowledge_base.load_dataset(), | |
| inputs=None, | |
| outputs=status, | |
| queue=False | |
| ) | |
| ask_btn.click( | |
| fn=chatbot_response, | |
| inputs=[question, chatbot], | |
| outputs=chatbot, | |
| queue=True | |
| ).then(lambda: "", None, question) | |
| clear_btn.click( | |
| fn=lambda: [], | |
| inputs=None, | |
| outputs=chatbot, | |
| queue=False | |
| ) | |
| # Launch the application | |
| if __name__ == "__main__": | |
| app.launch(server_name="0.0.0.0", server_port=7860) |