Sammaali committed on
Commit
b61fa99
·
verified ·
1 Parent(s): c147ba9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -13
app.py CHANGED
@@ -22,23 +22,27 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
22
 
23
  model = AutoModelForCausalLM.from_pretrained(
24
  model_id,
25
- torch_dtype=torch.float16,
26
- device_map="auto"
27
  )
28
 
 
 
 
29
  # =========================
30
  # Clean Text Using Gemma
31
  # =========================
32
  def clean_text(text):
33
 
 
 
34
  prompt = f"""
35
- You are an assistant that cleans Arabic speech transcripts.
 
 
 
36
 
37
- Tasks:
38
- - remove filler words (اممم، آآآ، يعني)
39
- - remove repeated words
40
- - keep the same meaning
41
- - return only the cleaned text
42
 
43
  Transcript:
44
  {text}
@@ -47,17 +51,19 @@ Transcript:
47
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
48
 
49
  with torch.no_grad():
 
50
  outputs = model.generate(
51
  **inputs,
52
- max_new_tokens=200,
53
- temperature=0.2
 
 
 
54
  )
55
 
56
  result = tokenizer.decode(outputs[0], skip_special_tokens=True)
57
 
58
- return result
59
-
60
- # =========================
61
  # ElevenLabs Speech To Text
62
  # =========================
63
 
 
22
 
23
# Load the Gemma causal LM in full precision (float32), then move it to
# the GPU when one is available, otherwise keep it on CPU.
# NOTE(review): float32 doubles memory vs. float16; presumably chosen for
# CPU compatibility on Spaces hardware — confirm before switching back.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
31
# =========================
# Clean Text Using Gemma
# =========================
def clean_text(text):
    """Clean an Arabic speech transcript using the loaded Gemma model.

    Builds a cleaning prompt (remove fillers such as اممم، آآآ، يعني,
    remove repeated words, keep the meaning), generates with sampling,
    and returns the decoded model output.

    Args:
        text: Raw Arabic transcript. Truncated to 1500 characters to
            keep the prompt within the model's context budget.

    Returns:
        The decoded generation (prompt + continuation, as produced by
        ``tokenizer.decode`` with special tokens stripped).
    """
    # Hard cap on input length so the prompt fits the context window.
    text = text[:1500]

    prompt = f"""
Clean this Arabic speech transcript.

Remove filler words like:
اممم، آآآ، يعني

Remove repeated words.
Keep the same meaning.

Transcript:
{text}
"""

    # Tokenize on the same device the model lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=120,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
        )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return result

# =========================
 
 
67
  # ElevenLabs Speech To Text
68
  # =========================
69