"""Gradio chat app that serves a Qwen ABAP-coder GGUF model on CPU.

Installs llama-cpp-python from the prebuilt CPU wheel index at runtime
(avoids compiling on the host), downloads a quantized GGUF model from the
Hugging Face Hub, and exposes a streaming chat UI via gradio.ChatInterface.
"""

import os
import subprocess
import sys


# --- 1. Force Install the Correct CPU Version (Runtime Install) ---
def install_llama():
    """Install llama-cpp-python from the CPU wheel index if it is missing.

    Uses the dedicated CPU wheel index so pip downloads a prebuilt wheel
    instead of compiling from source on the (small) host machine.
    """
    try:
        import llama_cpp  # noqa: F401 -- import only to probe availability
        print("llama-cpp-python is already installed.")
    except ImportError:
        print("Installing llama-cpp-python for CPU...")
        # We use the specific Index URL for CPU wheels to avoid compiling
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url",
            "https://abetlen.github.io/llama-cpp-python/whl/cpu",
        ])
        print("Installation complete!")


install_llama()

# These imports must come after install_llama() so llama_cpp is importable
# on a fresh machine.
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Configuration
REPO_ID = "mradermacher/qwen-coder-abap-v6-GGUF"
FILENAME = "qwen-coder-abap-v6.Q4_K_M.gguf"  # Best balance of speed/quality

# 2. Download the Model (Cached automatically by HF)
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME
)

# 3. Load the Model
# n_ctx=8192 allows for long ABAP code files.
# Two threads is optimal for the free HF Spaces tier, but never ask for
# more threads than the machine actually has.
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_threads=min(2, os.cpu_count() or 1),
    verbose=False
)


def _history_as_pairs(history):
    """Normalize Gradio chat history to a list of (user, assistant) pairs.

    Gradio's ChatInterface passes history either as a list of
    [user, assistant] pairs (legacy "tuples" format) or as a flat list of
    {"role": ..., "content": ...} dicts (the "messages" format that newer
    Gradio versions default to). Supporting both keeps the original tuple
    behavior while surviving a Gradio upgrade.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs = []
        pending_user = None
        for turn in history:
            role = turn.get("role")
            if role == "user":
                pending_user = turn.get("content", "")
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, turn.get("content", "")))
                pending_user = None
        return pairs
    # Legacy format: already (user, assistant) pairs.
    return [(user_msg, bot_msg) for user_msg, bot_msg in history]


# 4. The Generation Function
def generate_abap(message, history):
    """Stream a model completion for *message*, conditioned on *history*.

    Yields progressively longer partial strings so Gradio renders the
    answer token-by-token as it is generated.
    """
    # System prompt to enforce ABAP context
    system_prompt = "You are an expert ABAP developer. Write modern ABAP 7.4+ code where possible."

    # Construct the prompt using Qwen's ChatML format
    # <|im_start|>system...<|im_end|><|im_start|>user...<|im_end|><|im_start|>assistant
    prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"

    # Add history to keep context (optional, but good for chat)
    for user_msg, bot_msg in _history_as_pairs(history):
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"

    # Add current message and open the assistant turn.
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    # Streaming generation (Characters appear as they are generated)
    output_stream = llm(
        prompt,
        max_tokens=1024,      # Max length of answer
        stop=["<|im_end|>"],  # Stop when finished
        stream=True,          # Enable streaming
        temperature=0.1,      # Precise code
        top_p=0.9
    )

    partial_message = ""
    for chunk in output_stream:
        delta = chunk['choices'][0]['text']
        partial_message += delta
        yield partial_message


# 5. The Gradio Interface
demo = gr.ChatInterface(
    fn=generate_abap,
    title="ABAP Coder (Qwen 2.5 GGUF)",
    description="Ask for ABAP Reports, CDS Views, or Classes. Running on CPU.",
    examples=[
        "Write a report to select data from MARA using inline declarations.",
        "Create a CDS View for sales orders joining VBAK and VBAP.",
        "Explain how to use FIELD-SYMBOLS in a LOOP."
    ],
)

# 6. Launch
if __name__ == "__main__":
    demo.launch()