import os
import sys
import subprocess

# Ensure huggingface_hub is installed.
# If the import fails, the package is installed with pip, invoked through the
# interpreter that is running this script (sys.executable), and the import is
# retried. subprocess.check_call raises CalledProcessError when pip exits with
# a non-zero status, so a failed install aborts the script at this point; if
# the retried import still fails, the ImportError propagates.
try:
    from huggingface_hub import InferenceClient, get_token
except ImportError:
    print("[*] Installing required library 'huggingface_hub'...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "huggingface_hub"])
    from huggingface_hub import InferenceClient, get_token

# Terminal color codes (ANSI escape sequences).
# Each constant is written directly into printed strings; RESET must follow
# styled text, otherwise the style stays active for subsequent output.
RESET = "\033[0m"    # turn off all styling
BOLD = "\033[1m"     # bold / increased intensity
BLUE = "\033[94m"    # bright blue foreground
GREEN = "\033[92m"   # bright green foreground
YELLOW = "\033[93m"  # bright yellow foreground
RED = "\033[91m"     # bright red foreground
CYAN = "\033[96m"    # bright cyan foreground

def clear_screen():
    """Clear the terminal by running the platform's shell command for it.

    Runs ``cls`` when ``os.name`` is ``'nt'`` and ``clear`` otherwise. The
    command's exit status is ignored.
    """
    command = 'clear'
    if os.name == 'nt':
        command = 'cls'
    os.system(command)

def _read_line(prompt):
    """Read one line from stdin and return it with surrounding whitespace removed.

    Returns None instead of raising when the user presses Ctrl-C or when stdin
    reaches end-of-file (Ctrl-D, closed pipe), so callers can exit cleanly
    rather than crash with a traceback.
    """
    try:
        return input(prompt).strip()
    except (EOFError, KeyboardInterrupt):
        return None


def _stream_reply(client, messages):
    """Stream one chat completion to stdout and return the full reply text.

    Args:
        client: Object exposing ``chat_completion(messages=..., max_tokens=...,
            stream=True)`` that yields chunks with ``choices[0].delta.content``.
        messages: Conversation history sent with the request.

    Returns:
        The concatenation of every non-empty content fragment received.

    Raises:
        Whatever ``client.chat_completion`` or iterating its stream raises.
    """
    pieces = []
    stream = client.chat_completion(
        messages=messages,
        max_tokens=1024,
        stream=True
    )
    for chunk in stream:
        # A streamed chunk may carry no choices at all (for example a trailing
        # metadata-only chunk); indexing [0] on it would raise IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)
            pieces.append(content)
    print()  # New line after streaming ends
    return "".join(pieces)


def main():
    """Run the interactive terminal chat session.

    Prints a banner, reports whether ``get_token()`` returned a token, lets the
    user pick a model (or type a custom model ID), creates an
    ``InferenceClient`` and then loops reading user messages and streaming the
    model's replies. ``/exit`` and ``/quit`` end the session, ``/clear`` resets
    the conversation history. Ctrl-C or end-of-input at any prompt ends the
    session without a traceback.
    """
    farewell = f"\n{YELLOW}Goodbye!{RESET}"
    # Single definition so the initial history and '/clear' cannot drift apart.
    system_prompt = "You are a helpful and concise assistant."

    clear_screen()
    print(f"{BOLD}{BLUE}" + "="*60 + f"{RESET}")
    print(f"{BOLD}{BLUE}       HUGGING FACE REAL-TIME TERMINAL CLI CHAT{RESET}")
    print(f"{BOLD}{BLUE}" + "="*60 + f"{RESET}")

    # 1. Automatic Token Detection
    token = get_token()
    if token:
        print(f"\n{GREEN}[+] Found cached/environment Hugging Face Token! Running in Authenticated Mode.{RESET}")
    else:
        print(f"\n{YELLOW}[!] No cached token found. Running in Anonymous Mode (Rate-limited).{RESET}")
        print(f"{YELLOW}    Tip: Add your 'HF_TOKEN' as a Secret in Space Settings to remove limits.{RESET}")

    # 2. Select Model
    print(f"\n{BOLD}Select a model to chat with:{RESET}")
    # Menu key -> (display name, model ID); a None ID means "ask the user".
    models = {
        "1": ("DeepSeek-R1 (Reasoning)", "deepseek-ai/DeepSeek-R1"),
        "2": ("Llama-3.3-70B-Instruct", "meta-llama/Llama-3.3-70B-Instruct"),
        "3": ("Qwen2.5-Coder-32B", "Qwen/Qwen2.5-Coder-32B-Instruct"),
        "4": ("Phi-3-Mini-Instruct", "microsoft/Phi-3-mini-4k-instruct"),
        "5": ("Enter Custom Model ID", None)
    }

    for key, (name, path) in models.items():
        if path:
            print(f"  [{key}] {name} ({path})")
        else:
            print(f"  [{key}] {name}")

    choice = _read_line("\nChoose model number (1-5, default: 3): ")
    if choice is None:
        print(farewell)
        return
    if not choice:
        choice = "3"
    elif choice not in models:
        print(f"{RED}[!] Invalid choice. Defaulting to Qwen2.5-Coder-32B.{RESET}")
        choice = "3"

    model_name, model_path = models[choice]

    if model_path is None:
        model_path = _read_line("\nEnter Hugging Face Model ID (e.g. meta-llama/Llama-3.2-3B-Instruct): ")
        if model_path is None:
            print(farewell)
            return
        if not model_path:
            print(f"{RED}[!] No model ID entered. Exiting.{RESET}")
            return
        # Show only the repository name (the part after the owner) as the label.
        model_name = model_path.split("/")[-1]

    print(f"\n{GREEN}[*] Initializing client for model: {model_path}...{RESET}")

    # Initialize Inference Client
    try:
        client = InferenceClient(model=model_path, token=token)
    except Exception as e:
        print(f"{RED}[!] Error initializing client: {e}{RESET}")
        return

    print(f"\n{BOLD}{CYAN}Chat initialized! Type '/exit' or '/quit' to close the chat.{RESET}")
    print(f"{CYAN}Type '/clear' to clear conversation history.{RESET}")
    print(f"{BLUE}" + "-"*60 + f"{RESET}")

    messages = [{"role": "system", "content": system_prompt}]

    try:
        while True:
            user_input = _read_line(f"\n{BOLD}{GREEN}You > {RESET}")
            if user_input is None:
                # Ctrl-C or end-of-input at the prompt.
                print(farewell)
                break
            if not user_input:
                continue

            command = user_input.lower()
            if command in ('/exit', '/quit'):
                print(f"\n{YELLOW}Goodbye! (Refresh browser page to restart chat){RESET}")
                break

            if command == '/clear':
                messages = [{"role": "system", "content": system_prompt}]
                print(f"{YELLOW}Conversation history cleared!{RESET}")
                continue

            messages.append({"role": "user", "content": user_input})

            # Print assistant response prefix
            print(f"{BOLD}{BLUE}{model_name} > {RESET}", end="", flush=True)

            try:
                full_response = _stream_reply(client, messages)
            except Exception as e:
                print(f"\n{RED}[!] Error during generation: {e}{RESET}")
                # Remove the last user message since it failed to get a response
                messages.pop()
            else:
                # Append assistant response to history
                messages.append({"role": "assistant", "content": full_response})
    except KeyboardInterrupt:
        # Ctrl-C while a reply is streaming ends the session.
        print(farewell)

# Start the chat only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()