import sys
import types

# Python 3.13 compat: audioop removed; stub it so pydub/gradio can load
try:
    import audioop  # noqa: F401
except ModuleNotFoundError:
    sys.modules["audioop"] = types.ModuleType("audioop")
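# Note: the stub exposes no functions, so it only satisfies import-time checks;
# any code path that actually called into audioop would raise AttributeError.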

import gradio as gr
import time
import psutil

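# MAX_RAM_MB is the 4 GB ceiling the result badges below refer to; TEST_PROMPT is
# the fixed prompt sent to every model so runs are comparable.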
MAX_RAM_MB  = 4096
TEST_PROMPT = "Hi Mina, aiyo today so hot sia"


def get_available_memory_mb():
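    """Return system-wide available RAM in MB (psutil reports bytes)."""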
    return psutil.virtual_memory().available / (1024 * 1024)


def run_transformer_inference(model_id):
    # Generator function: Gradio streams each yielded (timing, memory, output, badge)
    # tuple to the four output components. Early exits must therefore yield before
    # returning; a plain `return value` inside a generator never reaches the UI.
    if not model_id or not model_id.strip():
        yield "No model ID provided", "", "", "FAIL"
        return

    model_id = model_id.strip()
    if model_id.lower().endswith(".gguf"):
        yield (
            "GGUF not supported here",
            "",
            "Use munyew/mina-test-honor-magic8 for GGUF models",
            "FAIL - Use the GGUF spaces for GGUF models",
        )
        return

    yield "Loading model from HuggingFace Hub...", "", "", "IN PROGRESS"

    available_mb = get_available_memory_mb()
    if available_mb < 512:
        yield (
            "Insufficient memory",
            f"Only {available_mb:.0f}MB available",
            "",
            "FAIL - Not enough RAM to load any model",
        )
        return

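    # transformers/torch are imported lazily inside the handler so the Gradio UI
    # can come up before the heavy ML dependencies are touched.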
    try:
        from transformers import pipeline
        import torch

        yield "Initialising transformers pipeline (CPU)...", "", "", "IN PROGRESS"

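        # Snapshot process RSS before loading; the post-load delta approximates the
        # model's in-memory footprint (weights + tokenizer + framework overhead).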
        mem_before = psutil.Process().memory_info().rss / (1024 * 1024)
        t_start    = time.time()

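        # trust_remote_code=True lets repos that ship custom modelling code load;
        # only point this at model IDs you trust.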
        pipe = pipeline(
            "text-generation",
            model=model_id,
            device="cpu",
            torch_dtype=torch.float32,
            trust_remote_code=True,
        )

        t_loaded    = time.time()
        mem_loaded  = psutil.Process().memory_info().rss / (1024 * 1024)
        load_mem_mb = mem_loaded - mem_before

        if load_mem_mb > MAX_RAM_MB:
            yield (
                f"Model too large: {load_mem_mb:.0f}MB",
                "",
                "",
                f"FAIL - {load_mem_mb:.0f}MB exceeds 4GB cloud minimum limit",
            )
            return

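        # Greedy decoding (do_sample=False) keeps the output deterministic, so
        # timings are comparable across runs of the same model.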
        output = pipe(
            TEST_PROMPT,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )

        t_end        = time.time()
        mem_after    = psutil.Process().memory_info().rss / (1024 * 1024)
        load_time_s  = t_loaded - t_start
        infer_ms     = (t_end - t_loaded) * 1000
        total_mem_mb = mem_after - mem_before

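        # text-generation pipelines return the prompt plus the continuation;
        # strip the echoed prompt so only the model's reply is displayed.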
        generated = output[0]["generated_text"]
        if generated.startswith(TEST_PROMPT):
            generated = generated[len(TEST_PROMPT):].strip()

        badge = (
            f"PASS - {total_mem_mb:.0f}MB RAM (within 4GB cloud limit)"
            if total_mem_mb <= MAX_RAM_MB
            else f"FAIL - {total_mem_mb:.0f}MB exceeded 4GB cloud minimum limit"
        )

        yield (
            f"Load: {load_time_s:.1f}s  |  Inference: {infer_ms:.0f}ms",
            f"{total_mem_mb:.0f} MB",
            generated,
            badge,
        )

    except Exception as e:
        err = str(e)
        if "out of memory" in err.lower() or "oom" in err.lower():
            yield "Out of Memory", "", "", "FAIL - OOM on 4GB cloud minimum"
        else:
            yield "Error loading model", "", err, "FAIL"


with gr.Blocks(title="Virtual Cloud Minimum", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Virtual Cloud Minimum\n"
        "**Transformer Model Test - 4GB RAM, CPU Only**\n\n"
        "*Tests HuggingFace transformer models (not GGUF) - for SEA-LION and similar*\n\n"
        "> Provide a HuggingFace model ID (e.g. `aisingapore/llm-sealion-1b`).\n"
        "> GGUF models are not supported here."
    )
    with gr.Row():
        model_id_input = gr.Textbox(
            label="HuggingFace Model ID",
            placeholder="aisingapore/llm-sealion-1b",
            scale=4,
        )
        run_btn = gr.Button("Run Test", variant="primary", scale=1)

    gr.Markdown(f"**Test prompt:** `{TEST_PROMPT}`")

    with gr.Row():
        timing_out      = gr.Textbox(label="Timing",       interactive=False)
        memory_used_out = gr.Textbox(label="Memory Used",  interactive=False)

    output_text_out = gr.Textbox(label="Model Output",  interactive=False, lines=4)
    status_out      = gr.Textbox(label="Result Badge",  interactive=False, lines=2)

    run_btn.click(
        run_transformer_inference,
        inputs=[model_id_input],
        outputs=[timing_out, memory_used_out, output_text_out, status_out],
    )

if __name__ == "__main__":
    demo.launch()
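
# Local usage sketch (the filename app.py and the dependency list below are
# assumptions, not pinned by this file):
#   pip install gradio psutil transformers torch
#   python app.py
# Gradio serves on http://127.0.0.1:7860 by default; paste a HuggingFace model ID
# and click "Run Test". On older Gradio 3.x releases, generator outputs only
# stream if the queue is enabled (demo.queue() before launch()).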