Spaces:
Sleeping
Sleeping
| import sys | |
| import types | |
# The stdlib ``audioop`` module no longer exists on Python 3.13. Libraries
# loaded later in this file (pydub / gradio) still try to import it, so when
# the real module is missing an empty placeholder is registered under that
# name purely to let those imports succeed.
try:
    import audioop  # noqa: F401
except ModuleNotFoundError:
    # A failed import leaves no entry behind, so setdefault always inserts
    # the placeholder here.
    sys.modules.setdefault("audioop", types.ModuleType("audioop"))
| # Lazy-install llama-cpp-python (avoids build-time OOM/timeout on HF Spaces) | |
def _ensure_llama_cpp():
    """Make sure ``llama_cpp`` is importable, installing it on first use.

    The package is installed lazily at startup instead of at image build
    time (the original comment cites build-time OOM/timeouts on HF Spaces).
    If the import already works, nothing happens.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command exits
            with a non-zero status.
    """
    try:
        import llama_cpp  # noqa: F401
    except ImportError:
        import importlib
        import subprocess

        print("Installing llama-cpp-python (first run — may take ~2 min)...", flush=True)
        subprocess.check_call([
            sys.executable, "-m", "pip", "install", "-q",
            "--extra-index-url",
            "https://abetlen.github.io/llama-cpp-python/whl/cpu",
            "llama-cpp-python",
        ])
        # The package was added to site-packages while this interpreter was
        # already running; drop the path finders' cached directory listings
        # so the later ``from llama_cpp import Llama`` can see it.
        importlib.invalidate_caches()
| _ensure_llama_cpp() | |
| import gradio as gr | |
| import requests | |
| import os | |
| import time | |
| import psutil | |
# Memory budget, in MiB, that a model must stay within (2 GiB).
MAX_RAM_MB = 2 * 1024
# Directory where downloaded model files are stored temporarily.
DOWNLOAD_DIR = "/tmp/models"
# Fixed prompt sent to every model under test.
TEST_PROMPT = "Hi Mina, aiyo today so hot sia"
def check_model_size(url):
    """Probe *url* with an HTTP HEAD request and report its advertised size.

    Returns a ``(size_mb, error)`` pair and never raises:

    * ``(float, None)`` -- the Content-Length header was present and numeric;
      the size is expressed in MiB.
    * ``(None, None)`` -- the request succeeded but the size is unknown
      (header missing, empty, or not a valid integer).
    * ``(None, str)`` -- the request itself failed; the string is the
      exception message.
    """
    try:
        head = requests.head(url, allow_redirects=True, timeout=10)
    except Exception as e:  # best-effort probe: report the failure, don't raise
        return None, str(e)

    content_length = head.headers.get("content-length")
    if not content_length:
        return None, None

    try:
        size_bytes = int(content_length)
    except ValueError:
        # A malformed header is not a connectivity problem; treat the size
        # as unknown instead of reporting it as a request error.
        return None, None
    return size_bytes / (1024 * 1024), None
def _discard_partial(path):
    """Delete *path* if it exists, ignoring any filesystem error."""
    try:
        os.remove(path)
    except OSError:
        pass


def download_model(url):
    """Stream the model at *url* into ``DOWNLOAD_DIR``, enforcing the size cap.

    The advertised size is checked first via ``check_model_size``; the cap
    (``MAX_RAM_MB``) is then enforced again while streaming, because the
    advertised size may be missing.

    Returns a ``(filepath, error)`` pair: ``(str, None)`` on success or
    ``(None, str)`` on failure. On every failure path any partially written
    file is removed, so nothing is left behind in ``DOWNLOAD_DIR``.
    """
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    # Last path segment of the URL, with query string and fragment removed.
    filename = url.split("?", 1)[0].split("#", 1)[0].rstrip("/").split("/")[-1]
    if filename in ("", ".", ".."):
        # Without a usable name the target path would be a directory.
        filename = "model.gguf"
    filepath = os.path.join(DOWNLOAD_DIR, filename)

    size_mb, err = check_model_size(url)
    if err:
        return None, f"Cannot reach URL: {err}"
    if size_mb and size_mb > MAX_RAM_MB:
        return None, (
            f"Model too large for Raspberry Pi 4: {size_mb:.1f}MB > 2GB limit\n"
            "Use Q2_K quantization to reduce model size."
        )

    limit_bytes = MAX_RAM_MB * 1024 * 1024
    exceeded = False
    try:
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            downloaded = 0
            with open(filepath, "wb") as f:
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
                    downloaded += len(chunk)
                    if downloaded > limit_bytes:
                        exceeded = True
                        break
    except Exception as e:
        # Network or filesystem failure mid-transfer: drop the partial file.
        _discard_partial(filepath)
        return None, str(e)

    if exceeded:
        # The file handle is closed by now, so removal is safe everywhere.
        _discard_partial(filepath)
        return None, "Download exceeded 2GB Raspberry Pi 4 limit"
    return filepath, None
def run_inference(model_url):
    """Download a GGUF model, run the fixed test prompt, and stream results.

    This is a generator: every yielded value is a 4-tuple that fills the
    four output boxes in order (inference time, memory used, model output,
    result badge). Intermediate yields carry progress text in the first slot
    and ``"IN PROGRESS"`` in the last.

    Args:
        model_url: URL of the model file; must contain ``.gguf``.

    Yields:
        ``(str, str, str, str)`` tuples as described above. The final tuple
        ends with a ``PASS``/``FAIL`` badge.

    The downloaded file is always deleted once inference finishes or fails.
    """
    # Validation failures must be *yielded*: in a generator a plain
    # ``return value`` ends iteration without delivering anything.
    if not model_url or not model_url.strip():
        yield "No URL provided", "", "", "FAIL"
        return
    model_url = model_url.strip()
    if ".gguf" not in model_url.lower():
        yield "Only GGUF format supported on Raspberry Pi 4", "", "", "FAIL"
        return

    yield "Checking model size...", "", "", "IN PROGRESS"
    filepath, error = download_model(model_url)
    if error:
        yield f"{error}", "", "", "FAIL"
        return

    try:
        from llama_cpp import Llama

        yield "Loading model on simulated ARM CPU...", "", "", "IN PROGRESS"
        mem_before = psutil.Process().memory_info().rss / (1024 * 1024)
        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments. The measured span covers model load plus
        # generation.
        t_start = time.perf_counter()
        llm = Llama(model_path=filepath, n_ctx=256, n_threads=2, verbose=False)
        output = llm(TEST_PROMPT, max_tokens=64, echo=False)
        t_end = time.perf_counter()
        mem_after = psutil.Process().memory_info().rss / (1024 * 1024)

        inference_ms = (t_end - t_start) * 1000
        # Growth of this process's resident set size across load + generation.
        memory_used_mb = mem_after - mem_before
        output_text = output["choices"][0]["text"].strip()
        badge = (
            "PASS - Fits on Raspberry Pi 4 (2GB)"
            if memory_used_mb <= MAX_RAM_MB
            else f"FAIL - Memory {memory_used_mb:.0f}MB exceeded 2GB Pi 4 limit"
        )
        yield (
            f"{inference_ms:.0f} ms",
            f"{memory_used_mb:.0f} MB",
            output_text,
            badge,
        )
    except Exception as e:
        yield "Inference error", "", str(e), "FAIL"
    finally:
        # Never keep the downloaded model around, whatever the outcome.
        if filepath and os.path.exists(filepath):
            os.remove(filepath)
# Gradio UI: one input row (URL + button), the test prompt, and four
# read-only result boxes fed by run_inference.
# NOTE(review): the original indentation was lost; which components sit
# inside each gr.Row() was reconstructed from the scale= hints -- confirm
# against the deployed layout.
with gr.Blocks(title="Virtual Raspberry Pi 4", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Virtual Raspberry Pi 4\n"
        "**Edge AI Test Environment - 2GB RAM Limit**\n\n"
        "*IoT / Embedded Linux deployment testing for Project Mina*\n\n"
        "> Simulates ARM Cortex-A72 with 2GB RAM. Use Q2_K models."
    )

    # Input row: model URL next to the trigger button.
    with gr.Row():
        model_url_input = gr.Textbox(
            label="GGUF Model URL",
            placeholder="https://huggingface.co/user/repo/resolve/main/model-q2_k.gguf",
            scale=4,
        )
        run_btn = gr.Button("Run Test", variant="primary", scale=1)

    gr.Markdown(f"**Test prompt:** `{TEST_PROMPT}`")

    # Result widgets; all are display-only.
    with gr.Row():
        inference_time_out = gr.Textbox(label="Inference Time", interactive=False)
        memory_used_out = gr.Textbox(label="Memory Used", interactive=False)
    output_text_out = gr.Textbox(label="Model Output", interactive=False, lines=4)
    status_out = gr.Textbox(label="Result Badge", interactive=False)

    # The output order here must match the tuple order yielded by
    # run_inference.
    run_btn.click(
        fn=run_inference,
        inputs=[model_url_input],
        outputs=[
            inference_time_out,
            memory_used_out,
            output_text_out,
            status_out,
        ],
    )

if __name__ == "__main__":
    demo.launch()