Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Gradio interface for Buildsnpper Chatbot. | |
| Deployed as a HuggingFace Space with ZeroGPU and 4-bit quantization. | |
| """ | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from sentence_transformers import SentenceTransformer | |
| import spaces | |
| import numpy as np | |
# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
# Hub repository id handed to from_pretrained() for both tokenizer and model.
MODEL_REPO = "bricksandbotltd/buildsnpper-chatbot-merged"
# Raw markdown guide that load_documentation() downloads and indexes.
DOCS_URL = (
    "https://huggingface.co/spaces/bricksandbot/assessor-platform-chat"
    "/raw/main/platform_functionality_guide.md"
)

# bitsandbytes settings: NF4 4-bit weights, double quantization, bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# ---------------------------------------------------------------------------
# Language model (loaded once, at import time)
# ---------------------------------------------------------------------------
print("Loading model and tokenizer with 4-bit quantization...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config,
)
model.eval()  # inference only
print("Model loaded successfully!")

# ---------------------------------------------------------------------------
# Retrieval: sentence-embedding model used for documents and queries alike
# ---------------------------------------------------------------------------
print("Loading RAG components...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
| # Load and chunk documentation | |
def load_documentation():
    """Download the platform guide, split it by heading, and embed each section.

    The markdown at ``DOCS_URL`` is cut at every line starting with ``## `` or
    ``### ``; text preceding the first such heading forms its own section.
    Sections that contain only whitespace are dropped, and the remaining ones
    are encoded with ``embedding_model``.

    Returns:
        A ``(sections, embeddings)`` pair: the list of section texts and the
        value returned by ``embedding_model.encode`` for them.  ``([], [])``
        is returned when the guide cannot be fetched or holds no text, which
        makes retrieval a no-op.
    """
    import http.client
    import urllib.request

    try:
        # Bound the wait so an unresponsive host cannot stall startup forever.
        with urllib.request.urlopen(DOCS_URL, timeout=30) as response:
            docs = response.read().decode('utf-8')
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # Best effort: network errors (URLError/HTTPError/timeouts are OSError),
        # malformed URLs and undecodable bodies (ValueError) only disable RAG.
        # KeyboardInterrupt/SystemExit are deliberately not intercepted.
        print("Warning: Could not fetch documentation, RAG disabled")
        print(f"  Reason: {exc!r}")
        return [], []

    # Split into sections by headers
    sections = []
    current_section = []
    for line in docs.split('\n'):
        if line.startswith(('## ', '### ')):
            if current_section:
                sections.append('\n'.join(current_section))
            current_section = [line]
        else:
            current_section.append(line)
    if current_section:
        sections.append('\n'.join(current_section))

    # Blank sections (e.g. empty lines before the first heading) carry no
    # information and must not be retrievable.
    sections = [text for text in sections if text.strip()]
    if not sections:
        print("Warning: Documentation is empty, RAG disabled")
        return [], []

    # Create embeddings for each section
    embeddings = embedding_model.encode(sections, show_progress_bar=False)
    print(f"Loaded {len(sections)} documentation sections")
    return sections, embeddings
# Build the retrieval index once, while the module is being imported.
# The readiness message is printed whether or not any sections were loaded.
(doc_sections, doc_embeddings) = load_documentation()
print("RAG components ready!")
def retrieve_relevant_docs(query, top_k=3):
    """Return the documentation sections most similar to *query*.

    Similarity is the cosine between the query embedding and each entry of
    the module-level ``doc_embeddings``.

    Args:
        query: Text to match against the indexed sections.
        top_k: Maximum number of sections to return.

    Returns:
        The selected sections, most similar first, joined by blank lines.
        An empty string when ``top_k`` is not positive or when no
        documentation is indexed.
    """
    # Checked first: a slice of [-0:] would otherwise select every section,
    # and a negative top_k would select an arbitrary tail.
    if top_k <= 0 or not doc_sections:
        return ""

    # Encode the query
    query_embedding = embedding_model.encode([query], show_progress_bar=False)[0]

    # Cosine similarity against every section.  A zero-norm vector would give
    # 0/0 = NaN, which argsort ranks last (i.e. as "most similar"); such rows
    # are scored 0 instead.
    dot_products = np.asarray(
        np.dot(doc_embeddings, query_embedding), dtype=np.float64
    )
    denominators = np.asarray(
        np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding),
        dtype=np.float64,
    )
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros(dot_products.shape, dtype=np.float64),
        where=denominators > 0,
    )

    # Indices of the top_k highest scores, best first
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return "\n\n".join(doc_sections[i] for i in top_indices)
def _build_messages(message, history, relevant_context):
    """Assemble the chat-template messages: system prompt, prior turns, new message."""
    system_content = """You are the Buildsnpper SAP Assessor Platform support assistant.
CRITICAL RULES - YOU MUST FOLLOW THESE EXACTLY:
1. ONLY use information EXPLICITLY STATED in the DOCUMENTATION below
2. DO NOT add any information from your general knowledge
3. DO NOT invent or assume any features, fields, terminology, or capabilities
4. DO NOT use terms that don't appear in the documentation (e.g., if documentation says "credits" don't say "assessor credits")
5. If information is not in the documentation, respond: "I don't have information about that in the Buildsnpper documentation"
6. Use EXACT terminology from the documentation - don't paraphrase or create new terms
7. Only mention fields, buttons, and steps that are EXPLICITLY listed in the documentation
8. If you're not 100% certain something is in the documentation below, DON'T mention it
DOCUMENTATION PROVIDED:
"""
    if relevant_context:
        system_content += f"\n{relevant_context}\n"
    else:
        system_content += "\n(No specific documentation retrieved for this query)\n"

    messages = [{"role": "system", "content": system_content}]
    for user_msg, bot_msg in history:
        # A turn may lack one half (None); passing None to the chat template
        # would not produce meaningful text, so that half is skipped.
        if user_msg is not None:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            messages.append({"role": "assistant", "content": bot_msg})

    # Add current message
    messages.append({"role": "user", "content": message})
    return messages


def chat(message, history):
    """
    Process user message and generate streaming response using ZeroGPU.

    The three most relevant documentation sections are retrieved for the
    message and placed in the system prompt; generation runs in a worker
    thread while this generator relays text from the streamer.

    NOTE(review): `spaces` is imported at the top of the file but this
    function is not wrapped in `@spaces.GPU` - confirm whether ZeroGPU
    allocation is expected to happen here.

    Args:
        message: User's input message
        history: List of [user_msg, bot_msg] pairs
    Yields:
        str: The response generated so far (cumulative, grows with each yield)
    Raises:
        Exception: Whatever `model.generate` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    from threading import Thread
    from transformers import TextIteratorStreamer

    # Retrieve relevant documentation
    relevant_context = retrieve_relevant_docs(message, top_k=3)

    # Build conversation history with system prompt + RAG context
    messages = _build_messages(message, history, relevant_context)

    # Format with chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response with streaming
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )
    generation_kwargs = dict(
        inputs,
        max_new_tokens=300,
        temperature=0.01,  # Very low temperature for more deterministic, factual responses
        do_sample=True,
        top_p=0.85,  # Slightly reduced to limit creative outputs
        repetition_penalty=1.1,  # Reduce repetition
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    generation_errors = []

    def _generate():
        # Runs in the worker thread.  If generate() fails it never signals the
        # end of the stream, and the consumer loop below would wait forever;
        # record the error and end the stream explicitly.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            streamer.end()

    # Start generation in separate thread
    thread = Thread(target=_generate)
    thread.start()

    # Stream the response
    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response
    thread.join()

    # Surface a worker failure to the caller instead of ending silently.
    if generation_errors:
        raise generation_errors[0]
# Sample questions passed to the ChatInterface `examples` argument.
examples = [
    "How do I create a new project in Buildsnpper?",
    "Can I transfer credits between clients?",
    "How much do credits cost?",
    "What happens when a client's subscription expires?",
    "How do I assign a subscription to a client?",
    "I forgot my password. How can I reset it?",
    "How do I download reports?",
    "Can multiple people work on the same project?",
]

# Markdown rendered above the chat widget.
_INTRO_MARKDOWN = """
# Buildsnpper SAP Assessor Platform Chatbot
Ask questions about the Buildsnpper platform, including:
- Project and client management
- Subscriptions and credits
- Platform features and navigation
- Account management
- Technical issues
**Note**: This chatbot is specialized for Buildsnpper platform questions only.
**Powered by**: ZeroGPU for fast inference
"""

# Markdown rendered below the chat widget (model details).
_FOOTER_MARKDOWN = """
---
**Model**: [bricksandbotltd/buildsnpper-chatbot-merged](https://huggingface.co/bricksandbotltd/buildsnpper-chatbot-merged)
**Base Model**: microsoft/Phi-4-mini-instruct (3.8B parameters)
**Fine-tuned**: LoRA on 89 Buildsnpper Q&A pairs
**Quantization**: 4-bit (NF4) with bitsandbytes
**Acceleration**: ZeroGPU
"""

# Page layout: intro text, chat widget wired to chat(), footer text.
with gr.Blocks(theme=gr.themes.Soft(), title="Buildsnpper Chatbot") as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    chatbot = gr.ChatInterface(
        fn=chat,
        title="",
        description="",
        examples=examples,
        clear_btn="Clear Chat",
        retry_btn=None,
        undo_btn=None,
    )
    gr.Markdown(_FOOTER_MARKDOWN)

# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()