# Chatbot-i / app.py
# Author: ShabanEjupi — "Update app.py" (commit d9e9533, verified)
import subprocess
import sys
# Install required packages
def install(package, *pip_args):
    """Install *package* into the current interpreter's environment via pip.

    Args:
        package: Distribution name, optionally with a version specifier
            (e.g. ``"torch==2.0.1+cu118"``).
        *pip_args: Optional extra command-line arguments forwarded to pip
            as separate argv entries (e.g. ``"-i", "https://test.pypi.org/simple/"``).
            Backward-compatible addition: existing single-argument calls behave
            exactly as before.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Use sys.executable so the package lands in the same environment that
    # is running this script, not whatever `pip` happens to be on PATH.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package, *pip_args]
    )
# Install pinned dependencies at startup (Spaces-style bootstrap).
# Failures are reported but non-fatal so one bad pin doesn't kill the app.
packages = [
    "transformers==4.34.0",
    "torch==2.0.1+cu118",
    "gradio==3.39.0",
    "accelerate==0.23.0",
    "bitsandbytes==0.41.1",
    "sentencepiece==0.1.99",
    "python-dotenv==1.0.0"
]
for package in packages:
    try:
        install(package)
    except Exception as e:
        # Best-effort: log and continue; the import below is the real check.
        print(f"Failed to install {package}: {e}")

# Fall back to Test PyPI for bitsandbytes if the pinned install failed.
try:
    import bitsandbytes
except ImportError:
    # BUG FIX: the original called install("bitsandbytes -i https://...")
    # which passes the flags inside ONE argv element, so pip searches for a
    # package literally named "bitsandbytes -i ...". Arguments must be
    # separate list items for subprocess (shell=False semantics).
    subprocess.check_call([
        sys.executable, "-m", "pip", "install",
        "bitsandbytes", "-i", "https://test.pypi.org/simple/",
    ])
from transformers import LlamaTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gradio as gr
from huggingface_hub import login
from dotenv import load_dotenv
import os
import torch
# Authenticate with the Hugging Face Hub.
# Reads HUGGINGFACE_TOKEN from the process environment (populated from a
# local .env file by python-dotenv when present).
load_dotenv()

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
# Guard clause: fail fast with a clear message when no token is configured.
if not huggingface_token:
    raise ValueError("HUGGINGFACE_TOKEN is missing in .env file!")
login(token=huggingface_token)
# Model configuration
model_name = "mistralai/Mistral-7B-v0.3"

# Load tokenizer (Mistral ships a Llama-style SentencePiece tokenizer).
# NOTE(review): AutoTokenizer.from_pretrained would be the more robust
# choice here — confirm before changing.
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Select a device map for model placement.
# BUG FIX: the original used `assert torch.cuda.is_available()` for this
# check. `assert` statements are stripped under `python -O`, in which case
# no exception is raised and device_map silently stayed "auto" even on a
# CUDA-less machine. An explicit `if` is always evaluated.
if torch.cuda.is_available():
    device_map = "auto"  # let accelerate spread the model across GPUs
else:
    print("CUDA is not available. Install CUDA or use CPU mode.")
    print("Falling back to CPU mode.")
    device_map = "cpu"
# 4-bit NF4 quantization (bitsandbytes) to cut GPU memory for the 7B model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Change to load_in_8bit=True if needed
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the model with optimized settings.
# BUG FIX: bitsandbytes 4-bit quantization requires CUDA; on the CPU
# fallback path (device_map == "cpu") passing quantization_config makes
# from_pretrained raise at load time. Skip quantization on CPU and load
# plain float16 weights instead (slower, but it loads).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config if device_map == "auto" else None,
    device_map=device_map,
    torch_dtype=torch.float16,
)
def respond(message, history):
    """Produce one assistant reply for the Gradio chat interface.

    Args:
        message: The user's latest chat message.
        history: Prior (user, assistant) turns supplied by gr.ChatInterface;
            currently unused — each reply is generated from the latest
            message alone.

    Returns:
        The model's generated continuation after the "Assistant:" marker.
    """
    prompt = f"User: {message}\nAssistant:"
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        return_attention_mask=True,
    ).to(model.device)
    generated = model.generate(
        **encoded,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
    )
    # The decoded text echoes the prompt; keep only what follows the last
    # "Assistant:" marker.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.split("Assistant:")[-1]
# Build the chat UI and serve it.
chat_ui = gr.ChatInterface(
    respond,
    title="Shërbimi i Konsumatorit",
    examples=["Si mund të rivendos fjalëkalimin?", "A e keni në dispozicion këtë produkt?"],
    cache_examples=True,
)
# share=True publishes a temporary public URL in addition to port 7860.
chat_ui.launch(server_port=7860, share=True)