import os
import tarfile
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
# 1. Download the tar.gz archive
archive_path = hf_hub_download(
    repo_id="SamOrion/Llama_3.2_3b_Hindi_Pruned",
    filename="llama-3.2-3b-hindi-pruned.tar.gz",
    repo_type="model",
)
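# hf_hub_download caches the file locally (under ~/.cache/huggingface/hub by
# default), so restarting the Space reuses the copy instead of re-downloading.
print(f"Archive available at: {archive_path}")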
# 2. Extract into './model', stripping the top-level folder
extract_dir = "./model"
os.makedirs(extract_dir, exist_ok=True)
with tarfile.open(archive_path, "r:gz") as tar:
    for member in tar.getmembers():
        # Strip the first path component (the archive's top-level folder)
        parts = member.name.split("/", 1)
        if len(parts) < 2 or not parts[1]:
            # The bare top-level folder entry itself -- nothing to place
            continue
        member.name = parts[1]
        tar.extract(member, path=extract_dir)
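# Hardening note: on Python 3.12+ (and the patched 3.9-3.11 releases),
# tar.extract() accepts filter="data", which rejects absolute paths and
# path-traversal members when extracting untrusted archives.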
# 3. Verify that config.json is at ./model/config.json
config_path = os.path.join(extract_dir, "config.json")
if not os.path.isfile(config_path):
    raise FileNotFoundError(f"config.json not found in {extract_dir}")
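# When debugging a Space, it helps to log what the extraction produced.
print("Extracted files:", sorted(os.listdir(extract_dir)))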
# 4. Load tokenizer and model straight from './model'
tokenizer = AutoTokenizer.from_pretrained(extract_dir)
model = AutoModelForCausalLM.from_pretrained(
    extract_dir,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
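# Optional sanity check: report where device_map="auto" placed the weights
# and which dtype torch_dtype="auto" resolved to.
print(f"Model on {model.device}, dtype {next(model.parameters()).dtype}")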
# 5. Define chat function using OpenAI-style messages
def chat_fn(prompt, history):
    history = history or []
    history.append({"role": "user", "content": prompt})
    # Tokenize only the latest prompt and move it to the model's device.
    # Note: earlier turns in `history` are kept for display, not fed as context.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[1]
    # generate() returns the full sequence (prompt + continuation)
    outputs = model.generate(**inputs, max_new_tokens=100)
    # Drop the prompt tokens so only the new continuation is decoded
    generated_ids = outputs[0][input_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
    history.append({"role": "assistant", "content": response})
    return history, ""
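# Hedged alternative (not wired into the UI below): if the pruned model's
# tokenizer still ships Llama 3.2's chat template, the whole conversation can
# be sent to the model instead of only the latest prompt. The name
# chat_fn_with_history is ours, purely illustrative.
def chat_fn_with_history(prompt, history):
    history = history or []
    history.append({"role": "user", "content": prompt})
    # apply_chat_template formats the OpenAI-style messages into model tokens
    input_ids = tokenizer.apply_chat_template(
        history, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=100)
    response = tokenizer.decode(
        outputs[0][input_ids.shape[1]:], skip_special_tokens=True
    )
    history.append({"role": "assistant", "content": response})
    return history, ""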
# 6. Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🌐 Indus 3.0 Hindi LLM Demo")
    chat = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Type here…")
    clear = gr.Button("Clear")
    msg.submit(chat_fn, [msg, chat], [chat, msg])
    clear.click(lambda: ([], ""), None, [chat, msg])
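# Optional (assumption: useful under concurrent traffic on Spaces): enable
# Gradio's request queue so generate() calls run one at a time.
# demo.queue(max_size=16)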
# 7. Launch without `share=True` on Spaces
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)