Dibiddo committed on
Commit
1827af1
·
verified ·
1 Parent(s): 4f0fc7d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
3
+ from datasets import load_dataset
4
+ import spacy
5
+ import gradio as gr
6
+
7
# Device setup: prefer the first CUDA GPU, otherwise fall back to CPU.
# Half precision is only used on GPU; CPU inference stays in float32.
_has_cuda = torch.cuda.is_available()
device = "cuda:0" if _has_cuda else "cpu"
torch_dtype = torch.float16 if _has_cuda else torch.float32
10
+
11
# Whisper initialization (speech-to-text).
whisper_model_id = "openai/whisper-large-v3"

# Processor first (tokenizer + feature extractor), then the model itself,
# loaded in the dtype chosen above and moved onto the target device.
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# ASR pipeline wiring the model and processor components together.
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    device=device,
)
26
+
27
# DeepSeek model initialization (text generation).
# NOTE(review): "deepseek-ai/DeepSeek-R1" is an extremely large MoE checkpoint;
# loading it via a plain pipeline() likely exceeds the memory of typical hosts —
# confirm the intended checkpoint and hardware.
# SECURITY: trust_remote_code=True executes Python code from the model repo at
# load time; only keep this for repositories you trust.
deepseek_pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1", trust_remote_code=True)
29
+
30
# spaCy initialization (text analysis / entity labelling).
# Requires the "en_core_web_sm" model package to be installed
# (python -m spacy download en_core_web_sm); spacy.load raises OSError otherwise.
nlp = spacy.load("en_core_web_sm")
32
+
33
# Core handler: audio file -> (transcription, LLM reply, named entities).
def process_audio(audio_file):
    """Transcribe audio, generate an AI reply, and extract named entities.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the recorded audio (as supplied by gr.Audio
        with type="filepath").

    Returns
    -------
    tuple
        (transcription, response_text, entities) where entities is a list
        of (entity_text, entity_label) pairs from spaCy.
    """
    # Speech-to-text via the Whisper pipeline.
    result = whisper_pipe(audio_file)["text"]

    # Generate a reply with DeepSeek using the chat-message format.
    messages = [{"role": "user", "content": result}]
    deepseek_response = deepseek_pipe(messages)[0]["generated_text"]
    # Bug fix: with chat-style (list-of-messages) input, the text-generation
    # pipeline returns "generated_text" as the full conversation — a list of
    # {"role", "content"} dicts — not a plain string. Feeding that list to
    # spaCy would raise a TypeError, so unwrap the final (assistant) message.
    if isinstance(deepseek_response, list):
        deepseek_response = deepseek_response[-1]["content"]

    # Named-entity extraction with spaCy.
    doc = nlp(deepseek_response)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    return result, deepseek_response, entities
47
+
48
# Gradio callback: adapt process_audio's tuple into the JSON payload shown in the UI.
def interface(audio_file):
    """Run the full pipeline on *audio_file* and package the results as a dict."""
    transcription, response, entities = process_audio(audio_file)
    labels = (
        "Transcription (Whisper)",
        "AI Response (DeepSeek)",
        "Extracted Entities (spaCy)",
    )
    return dict(zip(labels, (transcription, response, entities)))
56
+
57
# Gradio application layout and wiring.
with gr.Blocks() as app:
    gr.Markdown("# AI ๅฎขๆœ่‡ชๅ‹•ๅŒ–็ณป็ตฑ")

    with gr.Row():
        # Bug fix: Gradio 4.x removed the `source=` keyword from gr.Audio;
        # microphone capture is now requested via `sources=[...]`.
        audio_input = gr.Audio(sources=["microphone"], type="filepath", label="ไธŠๅ‚ณ่ชž้Ÿณ")
        output_text = gr.JSON(label="็ตๆžœ")

    submit_button = gr.Button("ๆไบค")
    submit_button.click(fn=interface, inputs=audio_input, outputs=output_text)

# Launch the app when run locally; when deploying to Hugging Face Spaces,
# the module-level `app` object is served directly instead of app.launch().
if __name__ == "__main__":
    app.launch()