import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time
import spaces
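
# --- Configuration ---
# MODEL_ID selects the Hugging Face checkpoint, MAX_NEW_TOKENS caps the length of each
# generated reply, and USE_GPU=False forces CPU execution even when CUDA is available.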
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-1.5B"
MAX_NEW_TOKENS = 512
USE_GPU = True

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("Warning: the HF_TOKEN environment variable is not set. You may not be able to access private models.")

print("--- Environment Setup ---")
device = torch.device("cuda" if torch.cuda.is_available() and USE_GPU else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: {device}")
print(f"Torch Threads: {torch.get_num_threads()}")
print(f"HF_TOKEN set: {'yes' if HF_TOKEN else 'no'}")

custom_css = """
.gradio-container {
    max-width: 850px !important;
    margin: auto;
}
.gr-chat {
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.user-message {
    background-color: #f0f7ff !important;
    border-radius: 8px;
}
.assistant-message {
    background-color: #f9f9f9 !important;
    border-radius: 8px;
}
.gr-button.primary-button {
    background-color: #1f4e79 !important;
}
.gr-form {
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
}
#intro-message {
    text-align: center;
    margin-bottom: 20px;
    padding: 15px;
    background: linear-gradient(135deg, #e8f4ff 0%, #f0f7ff 100%);
    border-radius: 10px;
    border-left: 4px solid #1f4e79;
}
.footer {
    text-align: center;
    margin-top: 20px;
    font-size: 0.8em;
    color: #666;
}
"""

print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")
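
# Globals populated by the loading block below; 'load_successful' gates warm-up and inference.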
model = None
tokenizer = None
load_successful = False
stop_token_ids_list = []

try:
    start_load_time = time.time()

    # Load the tokenizer; pass the HF token only when one is configured.
    tokenizer_kwargs = {
        "trust_remote_code": True
    }
    if HF_TOKEN:
        tokenizer_kwargs["token"] = HF_TOKEN

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        **tokenizer_kwargs
    )
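
    # Load the model weights: fp16 with automatic device placement on GPU, fp32 on CPU.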
    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if device.type == "cuda" else "cpu",
        "torch_dtype": torch.float16 if device.type == "cuda" else torch.float32,
    }
    if HF_TOKEN:
        model_kwargs["token"] = HF_TOKEN

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        **model_kwargs
    )

    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully in {load_time:.2f} seconds ---")
    load_successful = True
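
    # Collect the token IDs that should terminate generation: HyperCLOVA X's
    # end-of-turn/stop markers plus the tokenizer's EOS token, if present.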
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
        print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
            print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")

    print(f"Using Stop Token IDs: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")
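

# System prompt prepended to every conversation; it includes today's date so the model can reference it.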
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- The AI language model's name is \"CLOVA X\" and it was created by NAVER.\n"
        f"- Today is {current_date}.\n"
        f"- Answer the user's questions kindly, in detail, and in Korean."
    )


def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "Hello"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)
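
        # Minimal greedy generation (10 tokens) purely to warm the model up.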
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")

    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()
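

# ZeroGPU: the @spaces.GPU() decorator requests a GPU slice for the duration of each call.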
@spaces.GPU()
def predict(message, history):
    """
    Generate a response with HyperCLOVAX.
    'history' may arrive either as a list of [user, assistant] pairs (Gradio's
    default 'tuples' format) or as a list of role/content dicts (the 'messages'
    format); both are handled below.
    """
    if model is None or tokenizer is None:
        return "Error: The model is not loaded."

    system_prompt = get_system_prompt()

    # Every conversation starts with the model's expected "tool_list" and "system" turns.
    chat_history_formatted = [
        {"role": "tool_list", "content": ""},
        {"role": "system", "content": system_prompt}
    ]
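
    # Replay prior turns from the Gradio chat history so the model sees the full conversation.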
    if isinstance(history, list):
        for item in history:
            if isinstance(item, dict):
                # 'messages' format: already role/content dicts.
                chat_history_formatted.append({"role": item.get("role"), "content": item.get("content")})
            else:
                # 'tuples' format: [user_message, assistant_message] pairs.
                user_msg, assistant_msg = item
                chat_history_formatted.append({"role": "user", "content": user_msg})
                if assistant_msg:
                    chat_history_formatted.append({"role": "assistant", "content": assistant_msg})

    # The current user turn goes last.
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(device)
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")

    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"Error: A problem occurred while formatting the input. ({e})"
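
    # Sampling-based decoding: temperature and top_p trade determinism for variety in replies.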
    try:
        print("Generating response...")
        generation_start_time = time.time()

        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Generation Warning: No stop tokens defined.")

        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")

    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"Error: A problem occurred while generating the response. ({e})"

    # Decode only the newly generated tokens (everything after the prompt).
    response = "Error: Failed to generate a response."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "Error: A problem occurred while decoding the response."

    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response


print("--- Setting up Gradio Interface ---")

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""
    # NAVER hyperclovax: HyperCLOVAX-SEED-Text-Instruct-1.5B
    """, elem_id="intro-message")

    chatbot = gr.ChatInterface(
        fn=predict,
        examples=[
            ["What is NAVER CLOVA X?"],
            ["Explain the relationship between the Schrödinger equation and quantum mechanics."],
            ["Walk me through the steps of training a deep learning model."],
            ["I'm planning a trip to Jeju Island. Could you suggest a 3-night, 4-day itinerary?"],
            ["What are the five most important events in Korean history?"],
            ["Please explain the ethics of artificial intelligence."],
        ],
        cache_examples=False,
    )

    with gr.Accordion("Model Information", open=False):
        gr.Markdown(f"""
        - **Model**: {MODEL_ID}
        - **Environment**: running in a shared ZeroGPU environment
        - **Token limit**: generation is capped at {MAX_NEW_TOKENS} new tokens
        - **Hardware**: running on {'GPU' if device.type == 'cuda' else 'CPU'}
        """)

    gr.Markdown(
        "© 2025 NAVER HyperCLOVA X Demo | Powered by Hugging Face & ZeroGPU",
        elem_classes="footer"
    )


if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        server_name="0.0.0.0"
    )