"""Gradio chat app that serves a Qwen ABAP-coder GGUF model on CPU.

Installs llama-cpp-python from the prebuilt CPU wheel index at runtime
(avoids compiling on the host), downloads a quantized GGUF model from the
Hugging Face Hub, and exposes a streaming chat UI via gradio.ChatInterface.
"""

import os
import subprocess
import sys


# --- 1. Force Install the Correct CPU Version (Runtime Install) ---
def install_llama():
    """Install llama-cpp-python from the CPU wheel index if it is missing.

    Uses the dedicated CPU wheel index so pip downloads a prebuilt wheel
    instead of compiling from source on the (small) host machine.
    """
    try:
        import llama_cpp  # noqa: F401 -- import only to probe availability
        print("llama-cpp-python is already installed.")
    except ImportError:
        print("Installing llama-cpp-python for CPU...")
        # We use the specific Index URL for CPU wheels to avoid compiling
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "llama-cpp-python",
            "--extra-index-url",
            "https://abetlen.github.io/llama-cpp-python/whl/cpu",
        ])
        print("Installation complete!")


install_llama()

# These imports must come after install_llama() so llama_cpp is importable
# on a fresh machine.
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Configuration
REPO_ID = "mradermacher/qwen-coder-abap-v6-GGUF"
FILENAME = "qwen-coder-abap-v6.Q4_K_M.gguf"  # Best balance of speed/quality

# 2. Download the Model (Cached automatically by HF)
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME
)

# 3. Load the Model
# n_ctx=8192 allows for long ABAP code files.
# Two threads is optimal for the free HF Spaces tier, but never ask for
# more threads than the machine actually has.
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_threads=min(2, os.cpu_count() or 1),
    verbose=False
)


def _history_as_pairs(history):
    """Normalize Gradio chat history to a list of (user, assistant) pairs.

    Gradio's ChatInterface passes history either as a list of
    [user, assistant] pairs (legacy "tuples" format) or as a flat list of
    {"role": ..., "content": ...} dicts (the "messages" format that newer
    Gradio versions default to). Supporting both keeps the original tuple
    behavior while surviving a Gradio upgrade.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs = []
        pending_user = None
        for turn in history:
            role = turn.get("role")
            if role == "user":
                pending_user = turn.get("content", "")
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, turn.get("content", "")))
                pending_user = None
        return pairs
    # Legacy format: already (user, assistant) pairs.
    return [(user_msg, bot_msg) for user_msg, bot_msg in history]


# 4. The Generation Function
def generate_abap(message, history):
    """Stream a model completion for *message*, conditioned on *history*.

    Yields progressively longer partial strings so Gradio renders the
    answer token-by-token as it is generated.
    """
    # System prompt to enforce ABAP context
    system_prompt = "You are an expert ABAP developer. Write modern ABAP 7.4+ code where possible."

    # Construct the prompt using Qwen's ChatML format
    # <|im_start|>system...<|im_end|><|im_start|>user...<|im_end|><|im_start|>assistant
    prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"

    # Add history to keep context (optional, but good for chat)
    for user_msg, bot_msg in _history_as_pairs(history):
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"

    # Add current message and open the assistant turn.
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    # Streaming generation (Characters appear as they are generated)
    output_stream = llm(
        prompt,
        max_tokens=1024,      # Max length of answer
        stop=["<|im_end|>"],  # Stop when finished
        stream=True,          # Enable streaming
        temperature=0.1,      # Precise code
        top_p=0.9
    )

    partial_message = ""
    for chunk in output_stream:
        delta = chunk['choices'][0]['text']
        partial_message += delta
        yield partial_message


# 5. The Gradio Interface
demo = gr.ChatInterface(
    fn=generate_abap,
    title="ABAP Coder (Qwen 2.5 GGUF)",
    description="Ask for ABAP Reports, CDS Views, or Classes. Running on CPU.",
    examples=[
        "Write a report to select data from MARA using inline declarations.",
        "Create a CDS View for sales orders joining VBAK and VBAP.",
        "Explain how to use FIELD-SYMBOLS in a LOOP."
    ],
)

# 6. Launch
if __name__ == "__main__":
    demo.launch()