Upload folder using huggingface_hub
Browse files
- .gradio/flagged/dataset1.csv +2 -0
- __pycache__/app.cpython-314.pyc +0 -0
- app.py +35 -15
- vocab_12k/corpus_tokenized_12000.vocab +0 -0
.gradio/flagged/dataset1.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Shona Input,Max New Tokens,Temperature,output,timestamp
|
| 2 |
+
"{""mhoro"":""urise"", ""ndinofara"":""iri bho"", ""ndipo marii"":}",125,0.9,"{""mhoro"":""urise"", ""ndinofara"":""iri bho"", ""ndipo marii"":} Akatsanangura kutenderedzwa kwemiti, uye akati akazvifambisa kubhawa. Mhedzisiro yacho yaiva yekuti kana vafambi vakabuda ndokubvunisana kuti: ""Izvi hazvisizvo uye zvinokwanisika"". Mumashoko, izvi zvaireva kuti Pauro aiva * napo* ndiye akakubvunza chero chinhu. Handisi kukuudzai kuti ndimuratidze kuti ndiri ani. Izvi zvingava zvisiri izvo. Ndeipi IMHO, zvinoita sokuti kwakanaka. Ndinokutendai!! Ndakanzwa kuti mitambo miviri haisi kuratidzwa pane yangu, asi ndakaiisa mumashoko mugore rokupedzisira rechi11, uye ndiri kuzviita mhando yemhando. Yeuka kuti hapana mumwe",2026-03-04 17:30:24.173349
|
__pycache__/app.cpython-314.pyc
ADDED
|
Binary file (3.07 kB). View file
|
|
|
app.py
CHANGED
|
@@ -1,42 +1,62 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from transformers import LlamaTokenizer
|
| 3 |
import torch
|
| 4 |
import sys
|
| 5 |
import os
|
|
|
|
| 6 |
|
| 7 |
# Add local path to import custom model
|
| 8 |
sys.path.append(os.path.join(os.path.dirname(__file__), "model"))
|
| 9 |
from modeling_ngwanda import NgwandaModel
|
| 10 |
|
| 11 |
-
# Load the
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def predict(text, tokens, temp):
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
|
|
|
| 22 |
|
| 23 |
with torch.no_grad():
|
| 24 |
outputs = model.generate(
|
| 25 |
-
|
| 26 |
-
max_new_tokens=int(tokens),
|
| 27 |
temperature=float(temp),
|
| 28 |
-
do_sample=True
|
| 29 |
)
|
|
|
|
|
|
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# decode the original input plus newly generated
|
| 32 |
-
out_text =
|
|
|
|
| 33 |
return out_text
|
| 34 |
|
| 35 |
# Gradio Interface (Text Completion style)
|
| 36 |
gr.Interface(
|
| 37 |
fn=predict,
|
| 38 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
outputs="text",
|
| 40 |
-
title="
|
| 41 |
description="A base language model for Shona."
|
| 42 |
).launch()
|
|
|
|
| 1 |
import gradio as gr
import torch
import sys
import os
import sentencepiece as spm

# Make the bundled "model/" directory importable so the custom model class
# below can be loaded without installing it as a package.
sys.path.append(os.path.join(os.path.dirname(__file__), "model"))
from modeling_ngwanda import NgwandaModel

# Load the SentencePiece tokenizer directly from the shipped .model file
# (paths are relative to the app's working directory).
sp = spm.SentencePieceProcessor(model_file="vocab_12k/corpus_tokenized_12000.model")
# Beginning-of-sequence marker; prepended to prompts in predict() unless the
# user already typed it.
bos_token = "<|startoftext|>"
bos_id = sp.piece_to_id(bos_token)

# Load the Ngwanda base model weights from the Hugging Face Hub repository.
model = NgwandaModel.from_pretrained("takuM23/ShonaTransformer-Basemodel")
| 18 |
|
| 19 |
def predict(text, tokens, temp):
    """Generate a Shona text completion for *text*.

    Args:
        text: Prompt string entered in the UI.
        tokens: Maximum number of new tokens to generate (slider value;
            may arrive as float, hence the int() cast below).
        temp: Sampling temperature (slider value).

    Returns:
        The decoded text — original prompt plus the generated continuation —
        with all special tokens stripped.
    """
    print(text)  # debug: log the raw prompt

    # Encode the prompt to token IDs and prepend BOS unless the user
    # already typed the literal BOS marker themselves.
    input_ids = sp.encode_as_ids(text)
    if not text.startswith(bos_token):
        input_ids = [bos_id] + input_ids

    inputs_tensor = torch.tensor([input_ids])  # batch dimension of 1
    print({'input_ids': inputs_tensor})  # debug: inspect the model input

    # Sample a continuation; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs_tensor,
            max_new_tokens=int(tokens),
            temperature=float(temp),
            do_sample=True,
        )

    out_ids = outputs[0].tolist()

    # Filter out special tokens manually before decoding, so markers like
    # <|user|> never appear in the UI output.
    special_ids = {sp.bos_id(), sp.eos_id(), sp.pad_id(), sp.unk_id(),
                   sp.piece_to_id("<|startoftext|>"), sp.piece_to_id("<|endofturn|>"),
                   sp.piece_to_id("<|user|>"), sp.piece_to_id("<|agent|>")}
    out_ids = [i for i in out_ids if i not in special_ids]

    # Decode the original input plus the newly generated tokens.
    out_text = sp.decode(out_ids)
    print("Output text ", out_text)  # debug log (fixed typo: was "Otput text ")
    return out_text
|
| 50 |
|
| 51 |
# Wire up the text-completion demo UI and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Shona Input"),
        gr.Slider(10, 200, value=50, step=1, label="Max New Tokens"),
        gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs="text",
    title="Ngwanda Base Model",
    description="A base language model for Shona."
)
demo.launch()
|
vocab_12k/corpus_tokenized_12000.vocab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|