"""Self-installing customer-service chatbot.

Installs pinned dependencies with pip, logs into Hugging Face, loads a
4-bit quantized Mistral-7B model, and serves it through a Gradio chat UI.
"""

import subprocess
import sys


def install(*args):
    """Run ``pip install`` with the given arguments in this interpreter.

    Accepts either a single requirement string or a requirement plus extra
    pip flags (e.g. an alternate index URL) as separate arguments.
    Raises subprocess.CalledProcessError if pip exits non-zero.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", *args])


# Pinned dependency set for reproducible behavior.
packages = [
    "transformers==4.34.0",
    "torch==2.0.1+cu118",
    "gradio==3.39.0",
    "accelerate==0.23.0",
    "bitsandbytes==0.41.1",
    "sentencepiece==0.1.99",
    "python-dotenv==1.0.0",
]
for package in packages:
    try:
        install(package)
    except Exception as e:
        # Best-effort: one failed pin should not abort the remaining installs.
        print(f"Failed to install {package}: {e}")

# Fall back to the Test PyPI build of bitsandbytes if the main one is absent.
try:
    import bitsandbytes  # noqa: F401
except ImportError:
    # BUG FIX: pip flags must be separate argv entries. The original passed
    # the whole string "bitsandbytes -i https://..." as a single (invalid)
    # requirement name, so this fallback could never succeed.
    install("bitsandbytes", "-i", "https://test.pypi.org/simple/")

import os

import gradio as gr
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer

# Load HUGGINGFACE_TOKEN (and any other settings) from a local .env file.
load_dotenv()

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
if huggingface_token:
    login(token=huggingface_token)
else:
    raise ValueError("HUGGINGFACE_TOKEN is missing in .env file!")

model_name = "mistralai/Mistral-7B-v0.3"

# NOTE(review): LlamaTokenizer kept from the original; AutoTokenizer would
# select the canonical tokenizer class for this checkpoint — confirm before
# changing.
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# BUG FIX: the original gated this on `assert torch.cuda.is_available()`,
# which is stripped under `python -O` (silently forcing the GPU path on
# CPU-only hosts). Use an explicit condition instead.
if torch.cuda.is_available():
    device_map = "auto"
else:
    print("CUDA is not available. Falling back to CPU mode.")
    device_map = "cpu"

if device_map == "auto":
    # 4-bit NF4 quantization keeps the 7B model within a single consumer GPU.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # switch to load_in_8bit=True if needed
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_kwargs = {"quantization_config": bnb_config, "torch_dtype": torch.float16}
else:
    # BUG FIX: bitsandbytes 4-bit quantization requires CUDA. The original
    # still passed quantization_config (and float16) on the CPU fallback
    # path, crashing at load time; load unquantized float32 instead.
    model_kwargs = {"torch_dtype": torch.float32}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    **model_kwargs,
)


def respond(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The latest user message.
        history: Prior chat turns; unused — each reply is generated from
            the current message only (stateless prompting).

    Returns:
        The generated reply text, with the prompt prefix stripped.
    """
    inputs = tokenizer(
        f"User: {message}\nAssistant:",
        return_tensors="pt",
        return_attention_mask=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
    )
    # Keep only the text after the final "Assistant:" marker.
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1]


# Albanian-language customer-service chat UI.
gr.ChatInterface(
    respond,
    title="Shërbimi i Konsumatorit",
    examples=["Si mund të rivendos fjalëkalimin?", "A e keni në dispozicion këtë produkt?"],
    cache_examples=True,
).launch(server_port=7860, share=True)