takuM23 committed on
Commit
ab4905b
·
1 Parent(s): e843494

Upload folder using huggingface_hub

Browse files
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Shona Input,Max New Tokens,Temperature,output,timestamp
2
+ "{""mhoro"":""urise"", ""ndinofara"":""iri bho"", ""ndipo marii"":}",125,0.9,"{""mhoro"":""urise"", ""ndinofara"":""iri bho"", ""ndipo marii"":} Akatsanangura kutenderedzwa kwemiti, uye akati akazvifambisa kubhawa. Mhedzisiro yacho yaiva yekuti kana vafambi vakabuda ndokubvunisana kuti: ""Izvi hazvisizvo uye zvinokwanisika"". Mumashoko, izvi zvaireva kuti Pauro aiva * napo* ndiye akakubvunza chero chinhu. Handisi kukuudzai kuti ndimuratidze kuti ndiri ani. Izvi zvingava zvisiri izvo. Ndeipi IMHO, zvinoita sokuti kwakanaka. Ndinokutendai!! Ndakanzwa kuti mitambo miviri haisi kuratidzwa pane yangu, asi ndakaiisa mumashoko mugore rokupedzisira rechi11, uye ndiri kuzviita mhando yemhando. Yeuka kuti hapana mumwe",2026-03-04 17:30:24.173349
__pycache__/app.cpython-314.pyc ADDED
Binary file (3.07 kB). View file
 
app.py CHANGED
@@ -1,42 +1,62 @@
1
  import gradio as gr
2
- from transformers import LlamaTokenizer
3
  import torch
4
  import sys
5
  import os
 
6
 
7
  # Add local path to import custom model
8
  sys.path.append(os.path.join(os.path.dirname(__file__), "model"))
9
  from modeling_ngwanda import NgwandaModel
10
 
11
- # Load the wrapped tokenizer
12
- tokenizer = LlamaTokenizer.from_pretrained("./ngwanda-tokenizer-hf")
13
- # Load your Ngwanda base model
14
- model = NgwandaModel.from_pretrained("./model")
 
 
 
15
 
16
  def predict(text, tokens, temp):
17
- # Base models need the BOS token to start properly
18
- if not text.startswith(tokenizer.bos_token):
19
- text = tokenizer.bos_token + text
 
 
 
20
 
21
- inputs = tokenizer(text, return_tensors="pt")
 
22
 
23
  with torch.no_grad():
24
  outputs = model.generate(
25
- **inputs,
26
- max_new_tokens=int(tokens),
27
  temperature=float(temp),
28
- do_sample=True
29
  )
 
 
30
 
 
 
 
 
 
 
31
  # decode the original input plus newly generated
32
- out_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
33
  return out_text
34
 
35
  # Gradio Interface (Text Completion style)
36
  gr.Interface(
37
  fn=predict,
38
- inputs=[gr.Textbox(label="Shona Input"), gr.Slider(10, 200, value=50, step=1, label="Max New Tokens"), gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature")],
 
 
 
 
39
  outputs="text",
40
- title="Shona Base Model",
41
  description="A base language model for Shona."
42
  ).launch()
 
1
"""Gradio demo app for the Ngwanda (Shona) base language model."""

import os
import sys

import gradio as gr
import sentencepiece as spm
import torch

# Make the bundled custom-model package importable, but avoid growing
# sys.path with duplicates if this module is (re-)imported.
_model_dir = os.path.join(os.path.dirname(__file__), "model")
if _model_dir not in sys.path:
    sys.path.append(_model_dir)
from modeling_ngwanda import NgwandaModel

# Load the SentencePiece tokenizer directly (no HF tokenizer wrapper).
# NOTE(review): path is relative to the working directory — assumed to be
# the app root when launched; confirm for other deployment setups.
sp = spm.SentencePieceProcessor(model_file="vocab_12k/corpus_tokenized_12000.model")
bos_token = "<|startoftext|>"
bos_id = sp.piece_to_id(bos_token)

# Load your Ngwanda base model from the new HF repository
model = NgwandaModel.from_pretrained("takuM23/ShonaTransformer-Basemodel")
 
19
  def predict(text, tokens, temp):
20
+ print(text)
21
+
22
+ # encode as IDs
23
+ input_ids = sp.encode_as_ids(text)
24
+ if not text.startswith(bos_token):
25
+ input_ids = [bos_id] + input_ids
26
 
27
+ inputs_tensor = torch.tensor([input_ids])
28
+ print({'input_ids': inputs_tensor})
29
 
30
  with torch.no_grad():
31
  outputs = model.generate(
32
+ input_ids=inputs_tensor,
33
+ max_new_tokens=int(tokens),
34
  temperature=float(temp),
35
+ do_sample=True,
36
  )
37
+
38
+ out_ids = outputs[0].tolist()
39
 
40
+ # filter out special tokens manually
41
+ special_ids = {sp.bos_id(), sp.eos_id(), sp.pad_id(), sp.unk_id(),
42
+ sp.piece_to_id("<|startoftext|>"), sp.piece_to_id("<|endofturn|>"),
43
+ sp.piece_to_id("<|user|>"), sp.piece_to_id("<|agent|>")}
44
+ out_ids = [i for i in out_ids if i not in special_ids]
45
+
46
  # decode the original input plus newly generated
47
+ out_text = sp.decode(out_ids)
48
+ print("Otput text ", out_text)
49
  return out_text
50
 
51
# Build the Gradio UI (text-completion style) and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Shona Input"),
        gr.Slider(10, 200, value=50, step=1, label="Max New Tokens"),
        gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs="text",
    title="Ngwanda Base Model",
    description="A base language model for Shona.",
)
demo.launch()
vocab_12k/corpus_tokenized_12000.vocab ADDED
The diff for this file is too large to render. See raw diff