Spaces:

LejobuildYT
/

AI_Chatbot

Sleeping

App Files Files Community

AI_Chatbot / app.py

LejobuildYT

Upload 18 files

70b7b2b verified 8 days ago

raw

history blame contribute delete

7.25 kB

	#!/usr/bin/env python3
	"""
	Hugging Face Spaces Backend - Qwen 1.5B Instruct
	Leicht, schnell und speichereffizient
	"""

	import os
	import json
	import logging
	import gradio as gr
	from pathlib import Path
	import torch
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	pipeline
	)
	import time

	# Logging: configure the root logger at INFO and create this module's logger
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Configuration: run on the GPU when torch reports CUDA, otherwise on the CPU
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	# Default generation settings (upper bound on new tokens, sampling parameters)
	MAX_TOKENS = 512
	TEMPERATURE = 0.7
	TOP_P = 0.9

	# Default model (fixed choice; no memory probing is performed)
	def select_model():
	    """Return the Hugging Face model ID used when MODEL_NAME is not set.

	    The function always returns the Qwen2.5 1.5B Instruct checkpoint. An
	    earlier, unreachable second ``return`` for a Zephyr 7B AWQ checkpoint
	    has been removed because it could never execute.

	    Returns:
	        The model repository ID as a string.
	    """
	    return "Qwen/Qwen2.5-1.5B-Instruct"

	# The MODEL_NAME environment variable, when set, overrides the default model
	MODEL_NAME = os.environ.get("MODEL_NAME", select_model())
	logger.info(f"📌 Using model: {MODEL_NAME}")

	# Plugin system: directory that is scanned, and the registry filled by
	# load_plugins() (plugin name -> path of the plugin's source file)
	PLUGIN_DIR = Path("plugins")
	loaded_plugins = {}

	def load_plugins():
	    """Register the plugin source files found in the plugin directory.

	    Every ``*.py`` file directly inside ``PLUGIN_DIR`` whose file name does
	    not start with a double underscore is stored in ``loaded_plugins`` under
	    its stem. Only the path is recorded; the file is not imported or
	    executed here. If the directory does not exist, a warning is logged and
	    nothing is registered.
	    """
	    if not PLUGIN_DIR.exists():
	        logger.warning(f"Plugin directory {PLUGIN_DIR} does not exist")
	        return

	    # Skip dunder files such as __init__.py
	    candidates = (
	        path for path in PLUGIN_DIR.glob("*.py")
	        if not path.name.startswith("__")
	    )
	    for plugin_file in candidates:
	        try:
	            plugin_name = plugin_file.stem
	            logger.info(f"Loading plugin: {plugin_name}")
	            loaded_plugins[plugin_name] = plugin_file
	            logger.info(f"✓ Plugin {plugin_name} loaded")
	        except Exception as e:
	            logger.error(f"✗ Failed to load plugin {plugin_file}: {e}")

	def call_plugin_hook(hook_name, *args, **kwargs):
	    """Call a plugin hook on every registered plugin, if it exists.

	    The signature accepts arbitrary positional and keyword arguments to be
	    forwarded to the hook. (The previous ``args, *kwargs`` form made ``args``
	    mandatory and rejected keyword arguments.)

	    Args:
	        hook_name: Name of the hook to invoke.
	        *args: Positional arguments intended for the hook.
	        **kwargs: Keyword arguments intended for the hook.

	    Note:
	        Dispatch is not implemented yet: ``loaded_plugins`` only stores file
	        paths, so there is no module object to look the hook up on. The loop
	        below is the scaffold for that dispatch and currently does nothing.
	    """
	    for plugin_name in loaded_plugins:
	        try:
	            # TODO: import the plugin and call its `hook_name` attribute with
	            # (*args, **kwargs) once plugins are loaded as modules.
	            pass
	        except Exception as e:
	            logger.error(f"Error calling hook in {plugin_name}: {e}")

	# Initialize the model (no quantization is applied by the loader in this file)
	logger.info(f"⏳ Loading model {MODEL_NAME} on {DEVICE}...")

	def load_model_optimized():
	    """Load the tokenizer and causal language model for ``MODEL_NAME``.

	    No quantization is applied. On CUDA the model is loaded in float16 with
	    ``device_map="auto"``; otherwise it is loaded in float32 without a
	    device map.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    on_gpu = DEVICE == "cuda"
	    placement = "auto" if on_gpu else None
	    precision = torch.float16 if on_gpu else torch.float32

	    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	    model = AutoModelForCausalLM.from_pretrained(
	        MODEL_NAME,
	        device_map=placement,
	        torch_dtype=precision,
	    )

	    logger.info(f"✅ {MODEL_NAME} loaded successfully")
	    return tokenizer, model

	# Load the model once at import time and wrap it in a text-generation
	# pipeline. Any failure is logged and re-raised so the app does not start
	# without a model.
	try:
	    tokenizer, model = load_model_optimized()

	    # A model loaded with a device map is already placed on its device(s) and
	    # exposes `hf_device_map`; passing `device` for such a model conflicts
	    # with that placement. Only pass `device` when no device map was used.
	    pipeline_kwargs = {"model": model, "tokenizer": tokenizer}
	    if getattr(model, "hf_device_map", None) is None:
	        pipeline_kwargs["device"] = 0 if DEVICE == "cuda" else -1

	    pipe = pipeline("text-generation", **pipeline_kwargs)
	    logger.info("✅ Model loaded successfully")
	except Exception as e:
	    logger.error(f"✗ Failed to load model: {e}")
	    raise

	def generate_response(prompt: str, system_prompt: str = None) -> dict:
	    """
	    Generate a chat response with the loaded text-generation pipeline.

	    The sampling settings are read from the module-level ``TEMPERATURE`` and
	    ``TOP_P`` values at call time.

	    Args:
	        prompt: User input
	        system_prompt: Optional system prompt; skipped when empty or None

	    Returns:
	        dict with ``response``, ``tokens``, ``time_seconds`` and ``model``.
	        On failure the dict additionally has ``error`` set to True, the
	        ``response`` holds the error text, and the counters are 0.
	    """
	    try:
	        start_time = time.time()

	        # Chat-style message list; the pipeline applies the chat template
	        messages = []
	        if system_prompt:
	            messages.append({"role": "system", "content": system_prompt})
	        messages.append({"role": "user", "content": prompt})

	        # UI sliders may deliver ints (e.g. 1 or 2); the sampling parameters
	        # are normalised to float before being handed to the pipeline.
	        temperature = float(TEMPERATURE)
	        top_p = float(TOP_P)

	        generation_kwargs = {
	            "max_new_tokens": MAX_TOKENS,
	            "return_full_text": False,
	        }
	        if temperature > 0:
	            generation_kwargs.update(
	                do_sample=True,
	                temperature=temperature,
	                top_p=top_p,
	            )
	        else:
	            # Sampling needs a strictly positive temperature, so a value of
	            # 0 is served with greedy decoding instead of failing.
	            generation_kwargs["do_sample"] = False

	        # Generate
	        outputs = pipe(messages, **generation_kwargs)

	        response_text = outputs[0]["generated_text"].strip()
	        elapsed = time.time() - start_time

	        result = {
	            "response": response_text,
	            "tokens": len(tokenizer.encode(response_text)),
	            "time_seconds": round(elapsed, 2),
	            "model": MODEL_NAME,
	        }

	        logger.info(f"Generated {result['tokens']} tokens in {result['time_seconds']}s")
	        return result

	    except Exception as e:
	        # logger.exception keeps the traceback in the log
	        logger.exception(f"Error generating response: {e}")
	        return {
	            "response": f"Error: {str(e)}",
	            "tokens": 0,
	            "time_seconds": 0,
	            # Same keys as the success result so callers can index "model"
	            "model": MODEL_NAME,
	            "error": True,
	        }

	# Register the available plugin files once at import time, before the UI is built
	load_plugins()

	# Gradio Interface
	# The titles show the model that is actually served instead of a fixed name.
	with gr.Blocks(title=f"{MODEL_NAME} AI Chatbot") as demo:
	    gr.Markdown(f"# 🤖 {MODEL_NAME} Inference Server")
	    gr.Markdown("Powered by Hugging Face & Gradio")

	    # Top row: system prompt on the left, sampling controls on the right
	    with gr.Row():
	        with gr.Column(scale=2):
	            system_prompt = gr.Textbox(
	                label="System Prompt",
	                placeholder="Optional: Definiere die Rolle des Assistenten...",
	                lines=3,
	            )
	        with gr.Column(scale=1):
	            temperature = gr.Slider(
	                label="Temperature",
	                minimum=0.0,
	                maximum=2.0,
	                value=TEMPERATURE,
	                step=0.1,
	            )
	            top_p = gr.Slider(
	                label="Top P",
	                minimum=0.0,
	                maximum=1.0,
	                value=TOP_P,
	                step=0.05,
	            )

	    user_input = gr.Textbox(
	        label="Your Message",
	        placeholder="Type your message...",
	        lines=4,
	    )

	    submit_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

	    output_response = gr.Textbox(
	        label="Response",
	        interactive=False,
	        lines=6,
	    )

	    # `interactive` is not passed: gr.JSON is display-only here, and the
	    # argument is not part of the component's documented parameters.
	    output_stats = gr.JSON(label="Statistics")

	    def process_input(prompt, system, temp, top):
	        """Run one generation with the slider values and format the outputs.

	        generate_response() reads the module-level TEMPERATURE and TOP_P, so
	        they are overridden for the duration of the call and always restored.

	        Args:
	            prompt: Text from the message box.
	            system: Text from the system prompt box (may be empty).
	            temp: Temperature slider value.
	            top: Top P slider value.

	        Returns:
	            Tuple of (response text, statistics dict for the JSON view).
	        """
	        module_globals = globals()
	        old_temp = module_globals["TEMPERATURE"]
	        old_top = module_globals["TOP_P"]
	        module_globals["TEMPERATURE"] = temp
	        module_globals["TOP_P"] = top
	        try:
	            result = generate_response(prompt, system)
	        finally:
	            # Restore the defaults even if the call is interrupted
	            module_globals["TEMPERATURE"] = old_temp
	            module_globals["TOP_P"] = old_top

	        stats = {
	            "Tokens": result["tokens"],
	            "Time (s)": result["time_seconds"],
	            # Error results may not carry "model"; fall back to the
	            # configured model name instead of raising KeyError.
	            "Model": result.get("model", MODEL_NAME),
	        }

	        return result["response"], stats

	    submit_btn.click(
	        process_input,
	        inputs=[user_input, system_prompt, temperature, top_p],
	        outputs=[output_response, output_stats],
	    )

	    # Examples
	    gr.Examples(
	        examples=[
	            ["Was ist Machine Learning?", "Du bist ein hilfsbereiter AI-Assistent.", 0.7, 0.9],
	            ["Schreibe einen kurzen Witz", "", 0.9, 0.95],
	            ["Erkläre Quantencomputing in einfachen Worten", "Du bist ein Physik-Professor.", 0.7, 0.9],
	        ],
	        inputs=[user_input, system_prompt, temperature, top_p],
	    )

	if __name__ == "__main__":
	    # Enable the request queue, then serve on all interfaces at port 7860
	    # without creating a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo.queue().launch(**launch_options)