import os
from typing import List, Tuple

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import InferenceClient

# Load environment variables from .env if it exists
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
HF_MODEL_ID = os.getenv("HF_MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")
HF_ENDPOINT_URL = os.getenv("HF_ENDPOINT_URL", "").strip()
SYSTEM_PROMPT = os.getenv(
    "HF_SYSTEM_PROMPT",
    "You are a concise and helpful AI assistant.",
)
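
# An illustrative `.env` for local development; the values below are placeholders,
# not real secrets or required defaults. HF_ENDPOINT_URL is optional and only
# needed when targeting a dedicated Inference Endpoint.
#
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
#   HF_MODEL_ID=Qwen/Qwen2.5-1.5B-Instruct
#   HF_ENDPOINT_URL=https://<your-inference-endpoint-url>
#   HF_SYSTEM_PROMPT=You are a concise and helpful AI assistant.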

# HF_TOKEN is deliberately not required at import time so that the UI can still
# come up on Hugging Face Spaces; `respond` surfaces a clear guidance message if
# the token is missing.

# No global client is created because the model can be selected dynamically;
# an InferenceClient is created per call instead.

# Small, cloud-friendly model suggestions
RECOMMENDED_MODELS = [
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]


def format_prompt(message: str, history: List[Tuple[str, str]]) -> str:
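    """Build a flat plain-text chat prompt from the system prompt, history, and message.

    For example, with the default system prompt,
    format_prompt("How are you?", [("Hi", "Hello!")]) returns:

        System: You are a concise and helpful AI assistant.
        User: Hi
        Assistant: Hello!
        User: How are you?
        Assistant:
    """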
    conversation = [f"System: {SYSTEM_PROMPT}"]
    for user_msg, assistant_msg in history:
        if user_msg:
            conversation.append(f"User: {user_msg}")
        if assistant_msg:
            conversation.append(f"Assistant: {assistant_msg}")
    conversation.append(f"User: {message}")
    conversation.append("Assistant:")
    return "\n".join(conversation)


def respond(
    message: str,
    history: List[Tuple[str, str]],
    model_id: str = HF_MODEL_ID,
    temperature: float = 0.7,
    max_new_tokens: int = 512,
):
    # If no token or endpoint configured, guide the user from the UI.
    if not HF_TOKEN and not HF_ENDPOINT_URL:
        yield (
            "HF_TOKEN ayarlı değil. Hugging Face Space üzerinde Settings > Secrets menüsünden"
            " 'HF_TOKEN' gizli değişkenini ekleyin (veya bir Inference Endpoint URL'si sağlayın)."
        )
        return
    prompt = format_prompt(message, history)
    try:
        # Create a client per request to honor the selected model or endpoint.
        # Note: InferenceClient takes a dedicated Inference Endpoint URL via its
        # `model` argument; it has no separate `endpoint` keyword.
        if HF_ENDPOINT_URL:
            local_client = InferenceClient(model=HF_ENDPOINT_URL, token=HF_TOKEN)
        else:
            local_client = InferenceClient(model=(model_id or HF_MODEL_ID), token=HF_TOKEN)

        # Try streaming first
        accumulated = ""
        try:
            stream = local_client.text_generation(
                prompt=prompt,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=0.95,
                stream=True,
                details=False,
                return_full_text=False,
            )
            for chunk in stream:
                token_text = None
                # With details=True (or some huggingface_hub versions) chunks are objects with .token.text
                if hasattr(chunk, "token") and getattr(chunk.token, "text", None):
                    token_text = chunk.token.text
                # Fallback for dict responses
                if token_text is None and isinstance(chunk, dict):
                    token = chunk.get("token") or {}
                    token_text = token.get("text") or chunk.get("generated_text")
                # With details=False the stream typically yields plain strings
                if token_text is None and isinstance(chunk, str):
                    token_text = chunk

                if token_text:
                    accumulated += token_text
                    yield accumulated
        except StopIteration:
            # Some servers may prematurely raise StopIteration; fall back to non-streaming
            pass
        except Exception as stream_err:
            # Log and fall back to non-streaming
            print(f"[HF STREAM ERROR] {stream_err}")

        # Fallback: if nothing streamed, try a single-shot generation
        if not accumulated.strip():
            try:
                result = local_client.text_generation(
                    prompt=prompt,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    top_p=0.95,
                    stream=False,
                    details=False,
                    return_full_text=False,
                )
                if isinstance(result, dict):
                    text = result.get("generated_text", "")
                else:
                    text = str(result)
                yield text if text.strip() else "No response was received from the model."
            except Exception as nonstream_err:
                # Surface detailed error to the UI instead of a vague message
                err_text = str(nonstream_err).strip()
                response_text = ""
                if hasattr(nonstream_err, "response"):
                    response = getattr(nonstream_err, "response")
                    response_text = getattr(response, "text", "") or ""
                if response_text and response_text not in err_text:
                    err_text = f"{err_text} | {response_text}".strip(" |")
                if not err_text:
                    err_text = repr(nonstream_err)
                print(f"[HF NON-STREAM ERROR] {err_text}")
                yield f"Bir hata oluştu: {err_text}"
    except StopIteration:
        print("[HF API ERROR] StopIteration: API'den yanıt dönerken veri alınamadı.")
        yield "Bir hata oluştu: API'den yanıt alınamadı (StopIteration)."
    except Exception as err:  # pragma: no cover - surface errors to UI
        err_text = str(err).strip()
        response_text = ""
        if hasattr(err, "response"):
            response = getattr(err, "response")
            response_text = getattr(response, "text", "") or ""
        if response_text and response_text not in err_text:
            err_text = f"{err_text} | {response_text}".strip(" |")
        if "model_not_supported" in err_text or "not supported" in err_text:
            yield (
                "Seçilen model erişilebilir görünmüyor. `.env` içindeki `HF_MODEL_ID` "
                "değerini, hesabınızda etkin olan bir Hugging Face sohbet modeli ile güncellemeyi deneyin."
            )
            return
        if not err_text:
            err_text = repr(err)
        print(f"[HF API ERROR] {err_text}")
        yield f"Bir hata oluştu: {err_text}"


demo = gr.ChatInterface(
    respond,
    title="Gradio HF Agent",
    description=(
        "Hugging Face Inference API ile konuşan basit bir sohbet arayüzü. "
        "Aşağıdan model ve üretim ayarlarını değiştirebilirsiniz."
    ),
    theme="soft",
    additional_inputs=[
        gr.Dropdown(
            label="Model ID",
            info="Hugging Face model repository adı",
            choices=RECOMMENDED_MODELS,
            value=HF_MODEL_ID,
            allow_custom_value=True,
        ),
        gr.Slider(
            label="Sıcaklık (temperature)",
            minimum=0.0,
            maximum=1.0,
            value=0.7,
            step=0.05,
        ),
        gr.Slider(
            label="Maksimum yeni token",
            minimum=16,
            maximum=1024,
            value=512,
            step=16,
        ),
    ],
)
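
# To run locally (assuming this file is saved as app.py):
#   pip install gradio huggingface_hub python-dotenv
#   python app.py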

if __name__ == "__main__":
    demo.queue().launch()