"""Self-installing customer-service chatbot.

Installs pinned dependencies with pip, logs into Hugging Face, loads a
4-bit quantized Mistral-7B model, and serves it through a Gradio chat UI.
"""

import subprocess
import sys


def install(*args):
    """Run ``pip install`` with the given arguments in this interpreter.

    Accepts either a single requirement string or a requirement plus extra
    pip flags (e.g. an alternate index URL) as separate arguments.
    Raises subprocess.CalledProcessError if pip exits non-zero.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", *args])


# Pinned dependency set for reproducible behavior.
packages = [
    "transformers==4.34.0",
    "torch==2.0.1+cu118",
    "gradio==3.39.0",
    "accelerate==0.23.0",
    "bitsandbytes==0.41.1",
    "sentencepiece==0.1.99",
    "python-dotenv==1.0.0",
]
for package in packages:
    try:
        install(package)
    except Exception as e:
        # Best-effort: one failed pin should not abort the remaining installs.
        print(f"Failed to install {package}: {e}")

# Fall back to the Test PyPI build of bitsandbytes if the main one is absent.
try:
    import bitsandbytes  # noqa: F401
except ImportError:
    # BUG FIX: pip flags must be separate argv entries. The original passed
    # the whole string "bitsandbytes -i https://..." as a single (invalid)
    # requirement name, so this fallback could never succeed.
    install("bitsandbytes", "-i", "https://test.pypi.org/simple/")

import os

import gradio as gr
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer

# Load HUGGINGFACE_TOKEN (and any other settings) from a local .env file.
load_dotenv()

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
if huggingface_token:
    login(token=huggingface_token)
else:
    raise ValueError("HUGGINGFACE_TOKEN is missing in .env file!")

model_name = "mistralai/Mistral-7B-v0.3"

# NOTE(review): LlamaTokenizer kept from the original; AutoTokenizer would
# select the canonical tokenizer class for this checkpoint — confirm before
# changing.
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# BUG FIX: the original gated this on `assert torch.cuda.is_available()`,
# which is stripped under `python -O` (silently forcing the GPU path on
# CPU-only hosts). Use an explicit condition instead.
if torch.cuda.is_available():
    device_map = "auto"
else:
    print("CUDA is not available. Falling back to CPU mode.")
    device_map = "cpu"

if device_map == "auto":
    # 4-bit NF4 quantization keeps the 7B model within a single consumer GPU.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # switch to load_in_8bit=True if needed
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model_kwargs = {"quantization_config": bnb_config, "torch_dtype": torch.float16}
else:
    # BUG FIX: bitsandbytes 4-bit quantization requires CUDA. The original
    # still passed quantization_config (and float16) on the CPU fallback
    # path, crashing at load time; load unquantized float32 instead.
    model_kwargs = {"torch_dtype": torch.float32}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    **model_kwargs,
)


def respond(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The latest user message.
        history: Prior chat turns; unused — each reply is generated from
            the current message only (stateless prompting).

    Returns:
        The generated reply text, with the prompt prefix stripped.
    """
    inputs = tokenizer(
        f"User: {message}\nAssistant:",
        return_tensors="pt",
        return_attention_mask=True,
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
    )
    # Keep only the text after the final "Assistant:" marker.
    return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Assistant:")[-1]


# Albanian-language customer-service chat UI.
gr.ChatInterface(
    respond,
    title="Shërbimi i Konsumatorit",
    examples=["Si mund të rivendos fjalëkalimin?", "A e keni në dispozicion këtë produkt?"],
    cache_examples=True,
).launch(server_port=7860, share=True)