#!/usr/bin/env python3
"""
Hugging Face Spaces backend - Qwen 1.5B Instruct.

Lightweight, fast and memory-efficient. Importing this module loads the
model and builds the Gradio UI; running it as a script launches the server.
"""

import os
import json
import logging
import gradio as gr
from pathlib import Path
from typing import Optional
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)
import time

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_TOKENS = 512
TEMPERATURE = 0.7
TOP_P = 0.9


def select_model():
    """Return the default model id: Qwen 1.5B - small and fast."""
    return "Qwen/Qwen2.5-1.5B-Instruct"


# The MODEL_NAME environment variable overrides the default model id.
MODEL_NAME = os.getenv("MODEL_NAME", select_model())
logger.info(f"📌 Using model: {MODEL_NAME}")

# Plugin system
PLUGIN_DIR = Path("plugins")
loaded_plugins = {}


def load_plugins():
    """Register the plugin files found in ``PLUGIN_DIR``.

    Every ``*.py`` file whose name does not start with ``__`` is recorded in
    ``loaded_plugins`` as ``{stem: path}``. The files are only registered
    here; they are not imported or executed.
    """
    if not PLUGIN_DIR.exists():
        logger.warning(f"Plugin directory {PLUGIN_DIR} does not exist")
        return

    for plugin_file in PLUGIN_DIR.glob("*.py"):
        if plugin_file.name.startswith("__"):
            continue
        plugin_name = plugin_file.stem
        logger.info(f"Loading plugin: {plugin_name}")
        loaded_plugins[plugin_name] = plugin_file
        logger.info(f"✓ Plugin {plugin_name} loaded")


def call_plugin_hook(hook_name, *args, **kwargs):
    """Call a plugin hook if it exists.

    Placeholder: ``loaded_plugins`` only holds file paths, so there is
    nothing to dispatch to yet. The function accepts any arguments and
    returns ``None`` without side effects.
    """
    # TODO: import the registered plugin files and invoke `hook_name` on them.
    return None


# Initialize model
logger.info(f"⏳ Loading model {MODEL_NAME} on {DEVICE}...")


def load_model_optimized():
    """Load tokenizer and model for ``MODEL_NAME``.

    No quantization is applied (the default Qwen 1.5B model is already
    small). On CUDA the weights are loaded in float16 with
    ``device_map="auto"``; on CPU they are loaded in float32.

    Returns:
        tuple of ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto" if DEVICE == "cuda" else None,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    )
    logger.info(f"✅ {MODEL_NAME} loaded successfully")
    return tokenizer, model


try:
    tokenizer, model = load_model_optimized()

    # On CUDA the model was already placed via device_map="auto"; transformers
    # rejects (or warns about) an explicit `device` for such a model, so the
    # argument is only passed for the CPU case.
    pipeline_kwargs = {}
    if DEVICE != "cuda":
        pipeline_kwargs["device"] = -1

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **pipeline_kwargs,
    )
    logger.info("✅ Model loaded successfully")
except Exception as e:
    logger.error(f"✗ Failed to load model: {e}")
    raise


def generate_response(
    prompt: str,
    system_prompt: Optional[str] = None,
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
) -> dict:
    """
    Generate a response with the loaded text-generation pipeline.

    Args:
        prompt: User input.
        system_prompt: Optional system prompt; skipped when empty or None.
        temperature: Sampling temperature; defaults to ``TEMPERATURE``.
            A value <= 0 switches to greedy decoding.
        top_p: Nucleus sampling threshold; defaults to ``TOP_P``.

    Returns:
        dict with ``response``, ``tokens``, ``time_seconds`` and ``model``.
        On failure the dict additionally carries ``error: True`` and the
        response text holds the error message; no exception is raised.
    """
    if temperature is None:
        temperature = TEMPERATURE
    if top_p is None:
        top_p = TOP_P

    try:
        start_time = time.time()

        # Chat message format
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        generation_kwargs = {
            "max_new_tokens": MAX_TOKENS,
            "return_full_text": False,
        }
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
            )
        else:
            # Sampling requires a strictly positive temperature, so a
            # temperature of 0 is served with greedy decoding instead.
            generation_kwargs["do_sample"] = False

        # Generate
        outputs = pipe(messages, **generation_kwargs)

        response_text = outputs[0]["generated_text"].strip()
        elapsed = time.time() - start_time

        result = {
            "response": response_text,
            # Count only the generated text, not tokenizer-added specials.
            "tokens": len(
                tokenizer.encode(response_text, add_special_tokens=False)
            ),
            "time_seconds": round(elapsed, 2),
            "model": MODEL_NAME,
        }

        logger.info(f"Generated {result['tokens']} tokens in {result['time_seconds']}s")
        return result

    except Exception as e:
        logger.exception(f"Error generating response: {e}")
        return {
            "response": f"Error: {str(e)}",
            "tokens": 0,
            "time_seconds": 0,
            # Same keys as the success result so callers can index uniformly.
            "model": MODEL_NAME,
            "error": True,
        }


# Load plugins
load_plugins()

# Gradio interface
with gr.Blocks(title=f"{MODEL_NAME} AI Chatbot") as demo:
    gr.Markdown(f"# 🤖 {MODEL_NAME} Inference Server")
    gr.Markdown("Powered by Hugging Face & Gradio")

    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Optional: Definiere die Rolle des Assistenten...",
                lines=3,
            )
        with gr.Column(scale=1):
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=2.0,
                value=TEMPERATURE,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                value=TOP_P,
                step=0.05,
            )

    user_input = gr.Textbox(
        label="Your Message",
        placeholder="Type your message...",
        lines=4,
    )

    submit_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

    output_response = gr.Textbox(
        label="Response",
        interactive=False,
        lines=6,
    )

    # Used purely as an output, so no `interactive` flag is passed.
    output_stats = gr.JSON(label="Statistics")

    def process_input(prompt, system, temp, top):
        """Run one generation for the UI and return (response text, stats)."""
        # Sampling settings are passed per call rather than written to module
        # globals, so concurrent requests cannot overwrite each other.
        result = generate_response(prompt, system, temperature=temp, top_p=top)

        stats = {
            "Tokens": result["tokens"],
            "Time (s)": result["time_seconds"],
            "Model": result.get("model", MODEL_NAME),
        }

        return result["response"], stats

    submit_btn.click(
        process_input,
        inputs=[user_input, system_prompt, temperature, top_p],
        outputs=[output_response, output_stats],
    )

    # Examples
    gr.Examples(
        examples=[
            ["Was ist Machine Learning?", "Du bist ein hilfsbereiter AI-Assistent.", 0.7, 0.9],
            ["Schreibe einen kurzen Witz", "", 0.9, 0.95],
            ["Erkläre Quantencomputing in einfachen Worten", "Du bist ein Physik-Professor.", 0.7, 0.9],
        ],
        inputs=[user_input, system_prompt, temperature, top_p],
    )


if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )