Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Hugging Face Spaces backend - Qwen 1.5B Instruct.

Lightweight, fast, and memory-efficient.
"""
| import os | |
| import json | |
| import logging | |
| import gradio as gr | |
| from pathlib import Path | |
| import torch | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| pipeline | |
| ) | |
| import time | |
# Logging: INFO-level root configuration plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
# Run on the GPU when torch reports CUDA support, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Default generation settings (token budget and sampling parameters).
MAX_TOKENS = 512
TEMPERATURE = 0.7
TOP_P = 0.9
# Default model selection
def select_model():
    """Return the Hugging Face model id used by default.

    Qwen 1.5B Instruct is chosen because it is small and fast. The id is a
    fixed constant; no inspection of available memory is performed.

    Returns:
        The model repository id as a string.
    """
    # The former second return (a Zephyr 7B AWQ id) was unreachable and has
    # been removed.
    return "Qwen/Qwen2.5-1.5B-Instruct"
# The MODEL_NAME environment variable, when set, overrides the default id.
MODEL_NAME = os.environ.get("MODEL_NAME", select_model())
logger.info(f"📌 Using model: {MODEL_NAME}")

# Plugin System
PLUGIN_DIR = Path("plugins")
# Registry of discovered plugins: plugin name (file stem) -> plugin file path.
loaded_plugins = dict()
def load_plugins():
    """Register every plugin file found in the plugins directory.

    Each ``*.py`` file in ``PLUGIN_DIR`` whose name does not start with a
    double underscore is recorded in ``loaded_plugins`` under its file stem.
    Only the path is stored; the file is not imported or executed here.
    If the directory is missing, a warning is logged and nothing is registered.
    """
    if not PLUGIN_DIR.exists():
        logger.warning(f"Plugin directory {PLUGIN_DIR} does not exist")
        return

    # Skip dunder files such as __init__.py.
    candidates = (
        path for path in PLUGIN_DIR.glob("*.py")
        if not path.name.startswith("__")
    )
    for path in candidates:
        try:
            name = path.stem
            logger.info(f"Loading plugin: {name}")
            loaded_plugins[name] = path
            logger.info(f"✓ Plugin {name} loaded")
        except Exception as exc:
            # A failure on one plugin must not stop the remaining ones.
            logger.error(f"✗ Failed to load plugin {path}: {exc}")
def call_plugin_hook(hook_name, *args, **kwargs):
    """Visit every registered plugin for the given hook (currently a no-op).

    Args:
        hook_name: Name of the hook to call. Not used yet.
        *args: Positional arguments intended for the hook. Not used yet.
        **kwargs: Keyword arguments intended for the hook. Not used yet.

    Returns:
        None. No plugin code is executed by this function at present.
    """
    for name in loaded_plugins:
        try:
            # Placeholder: hook dispatch has not been implemented.
            continue
        except Exception as exc:
            logger.error(f"Error calling hook in {name}: {exc}")
# Initialize model (loaded without quantization)
logger.info(f"⏳ Loading model {MODEL_NAME} on {DEVICE}...")


def load_model_optimized():
    """Load the tokenizer and causal LM for ``MODEL_NAME``.

    Qwen 1.5B is small enough that no quantization is applied. On CUDA the
    model is loaded in float16 with ``device_map="auto"``; otherwise it is
    loaded in float32 without a device map.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    if DEVICE == "cuda":
        placement, precision = "auto", torch.float16
    else:
        placement, precision = None, torch.float32

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map=placement,
        torch_dtype=precision,
    )
    logger.info(f"✅ {MODEL_NAME} loaded successfully")
    return tok, lm
# Build the module-level tokenizer, model, and text-generation pipeline once
# at import time. Any failure is logged and re-raised so the app does not
# start without a usable model.
try:
    tokenizer, model = load_model_optimized()

    # On CUDA the model is loaded with device_map="auto", so it is already
    # placed on the GPU; transformers asks callers not to pass `device` for
    # such models. On CPU the pipeline is pinned to the CPU explicitly.
    pipeline_kwargs = {}
    if DEVICE != "cuda":
        pipeline_kwargs["device"] = -1

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **pipeline_kwargs,
    )
    logger.info("✅ Model loaded successfully")
except Exception as e:
    logger.error(f"✗ Failed to load model: {e}")
    raise
def generate_response(
    prompt: str,
    system_prompt: str = None,
    *,
    temperature: float = None,
    top_p: float = None,
) -> dict:
    """Generate a chat response with the loaded text-generation pipeline.

    Args:
        prompt: User input.
        system_prompt: Optional system prompt; skipped when empty or None.
        temperature: Sampling temperature for this call. Defaults to the
            module-level ``TEMPERATURE``. A value of 0 or less switches to
            greedy decoding, because sampling needs a positive temperature.
        top_p: Nucleus-sampling threshold for this call. Defaults to the
            module-level ``TOP_P``. Ignored when decoding greedily.

    Returns:
        dict with ``response``, ``tokens``, ``time_seconds`` and ``model``.
        On failure the dict also carries ``error: True`` and the response
        text holds the error message; no exception propagates to the caller.
    """
    if temperature is None:
        temperature = TEMPERATURE
    if top_p is None:
        top_p = TOP_P

    try:
        start_time = time.perf_counter()

        # Chat-style message list (system message first, when provided).
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        generation_kwargs = {
            "max_new_tokens": MAX_TOKENS,
            "return_full_text": False,
        }
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
            )
        else:
            # The UI slider allows 0.0; fall back to greedy decoding.
            generation_kwargs["do_sample"] = False

        outputs = pipe(messages, **generation_kwargs)
        response_text = outputs[0]["generated_text"].strip()
        elapsed = time.perf_counter() - start_time

        result = {
            "response": response_text,
            # Count only the generated text, without added special tokens.
            "tokens": len(
                tokenizer.encode(response_text, add_special_tokens=False)
            ),
            "time_seconds": round(elapsed, 2),
            "model": MODEL_NAME,
        }
        logger.info(
            f"Generated {result['tokens']} tokens in {result['time_seconds']}s"
        )
        return result
    except Exception as e:
        logger.exception(f"Error generating response: {e}")
        # Same keys as the success result so callers can read them safely.
        return {
            "response": f"Error: {str(e)}",
            "tokens": 0,
            "time_seconds": 0,
            "model": MODEL_NAME,
            "error": True,
        }


# Load plugins
load_plugins()

# Gradio Interface
with gr.Blocks(title=f"{MODEL_NAME} AI Chatbot") as demo:
    gr.Markdown(f"# 🤖 {MODEL_NAME} Inference Server")
    gr.Markdown("Powered by Hugging Face & Gradio")

    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Optional: Definiere die Rolle des Assistenten...",
                lines=3,
            )
        with gr.Column(scale=1):
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=2.0,
                value=TEMPERATURE,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                value=TOP_P,
                step=0.05,
            )

    user_input = gr.Textbox(
        label="Your Message",
        placeholder="Type your message...",
        lines=4,
    )
    submit_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    output_response = gr.Textbox(
        label="Response",
        interactive=False,
        lines=6,
    )
    # gr.JSON is used for display only. NOTE(review): `interactive` is not
    # passed because newer Gradio releases appear not to accept it on JSON;
    # confirm against the pinned Gradio version.
    output_stats = gr.JSON(label="Statistics")

    def process_input(prompt, system, temp, top):
        """Run one generation for the UI.

        The slider values are passed as per-call arguments instead of
        temporarily overwriting module globals, so concurrent requests
        cannot see each other's settings.

        Returns:
            A ``(response_text, stats_dict)`` pair for the two outputs.
        """
        result = generate_response(
            prompt,
            system,
            temperature=temp,
            top_p=top,
        )
        stats = {
            "Tokens": result["tokens"],
            "Time (s)": result["time_seconds"],
            "Model": result.get("model", MODEL_NAME),
        }
        return result["response"], stats

    submit_btn.click(
        process_input,
        inputs=[user_input, system_prompt, temperature, top_p],
        outputs=[output_response, output_stats],
    )

    # Examples
    gr.Examples(
        examples=[
            ["Was ist Machine Learning?", "Du bist ein hilfsbereiter AI-Assistent.", 0.7, 0.9],
            ["Schreibe einen kurzen Witz", "", 0.9, 0.95],
            ["Erkläre Quantencomputing in einfachen Worten", "Du bist ein Physik-Professor.", 0.7, 0.9],
        ],
        inputs=[user_input, system_prompt, temperature, top_p],
    )
# Script entry point: enable request queuing, then serve on all interfaces.
if __name__ == "__main__":
    queued_app = demo.queue()
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )