import os
import tarfile
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
# 1. Download the tar.gz archive
archive_path = hf_hub_download(
    repo_id="SamOrion/Llama_3.2_3b_Hindi_Pruned",
    filename="llama-3.2-3b-hindi-pruned.tar.gz",
    repo_type="model",
)
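# hf_hub_download caches the file locally (under ~/.cache/huggingface/hub by
# default), so restarting the Space reuses the copy instead of re-downloading.
print(f"Archive available at: {archive_path}")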
# 2. Extract into './model', stripping the top-level folder
extract_dir = "./model"
os.makedirs(extract_dir, exist_ok=True)
with tarfile.open(archive_path, "r:gz") as tar:
    for member in tar.getmembers():
        # Strip the first path component (the archive's top-level folder)
        parts = member.name.split("/", 1)
        if len(parts) < 2 or not parts[1]:
            # The bare top-level folder entry itself -- nothing to place
            continue
        member.name = parts[1]
        tar.extract(member, path=extract_dir)
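# Hardening note: on Python 3.12+ (and the patched 3.9-3.11 releases),
# tar.extract() accepts filter="data", which rejects absolute paths and
# path-traversal members when extracting untrusted archives.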
# 3. Verify that config.json is at ./model/config.json
config_path = os.path.join(extract_dir, "config.json")
if not os.path.isfile(config_path):
    raise FileNotFoundError(f"config.json not found in {extract_dir}")
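# When debugging a Space, it helps to log what the extraction produced.
print("Extracted files:", sorted(os.listdir(extract_dir)))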
# 4. Load tokenizer and model straight from './model'
tokenizer = AutoTokenizer.from_pretrained(extract_dir)
model = AutoModelForCausalLM.from_pretrained(
    extract_dir,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
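# Optional sanity check: report where device_map="auto" placed the weights
# and which dtype torch_dtype="auto" resolved to.
print(f"Model on {model.device}, dtype {next(model.parameters()).dtype}")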
# 5. Define chat function using OpenAI-style messages
def chat_fn(prompt, history):
    history = history or []
    history.append({"role": "user", "content": prompt})
    # Tokenize only the latest prompt and move it to the model's device.
    # Note: earlier turns in `history` are kept for display, not fed as context.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs["input_ids"].shape[1]
    # generate() returns the full sequence (prompt + continuation)
    outputs = model.generate(**inputs, max_new_tokens=100)
    # Drop the prompt tokens so only the new continuation is decoded
    generated_ids = outputs[0][input_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
    history.append({"role": "assistant", "content": response})
    return history, ""
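# Hedged alternative (not wired into the UI below): if the pruned model's
# tokenizer still ships Llama 3.2's chat template, the whole conversation can
# be sent to the model instead of only the latest prompt. The name
# chat_fn_with_history is ours, purely illustrative.
def chat_fn_with_history(prompt, history):
    history = history or []
    history.append({"role": "user", "content": prompt})
    # apply_chat_template formats the OpenAI-style messages into model tokens
    input_ids = tokenizer.apply_chat_template(
        history, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=100)
    response = tokenizer.decode(
        outputs[0][input_ids.shape[1]:], skip_special_tokens=True
    )
    history.append({"role": "assistant", "content": response})
    return history, ""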
# 6. Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🌐 Indus 3.0 Hindi LLM Demo")
    chat = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Type here…")
    clear = gr.Button("Clear")
    msg.submit(chat_fn, [msg, chat], [chat, msg])
    clear.click(lambda: ([], ""), None, [chat, msg])
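# Optional (assumption: useful under concurrent traffic on Spaces): enable
# Gradio's request queue so generate() calls run one at a time.
# demo.queue(max_size=16)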
# 7. Launch without `share=True` on Spaces
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)