| import os |
| import numpy as np |
| import faiss |
| import torch |
| import gradio as gr |
| from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer |
| from sentence_transformers import SentenceTransformer |
| import librosa |
|
|
# Inference device handed to the speech-recognition pipeline below.
device = "cpu"

# Speech-to-text pipeline built on the Whisper "small" checkpoint.
# chunk_length_s=30 asks the pipeline to work through the audio in
# 30-second chunks.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    chunk_length_s=30,
)

# Decoder prompt ids requesting Arabic transcription; transcribe_audio()
# forwards them to generation on every call.
forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(
    task="transcribe",
    language="arabic",
)
|
|
# Seq2seq summarisation checkpoint; the model and its tokenizer are loaded
# from the same hub id so their vocabularies match.
summ_model_name = "csebuetnlp/mT5_multilingual_XLSum"
summ_model = AutoModelForSeq2SeqLM.from_pretrained(summ_model_name)
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model_name)
|
|
# Sentence encoder used for semantic search over transcript segments.
embedding_model = SentenceTransformer(
    model_name_or_path="intfloat/multilingual-e5-base"
)
# Vector width reported by the encoder; the FAISS index is sized with it.
embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
|
# Audio emotion classifier used by detect_emotion().
# NOTE(review): device=-1 is the transformers pipeline convention for CPU;
# it is set independently of the module-level `device` value.
emotion_classifier = pipeline(
    task="audio-classification",
    model="Dpngtm/wav2vec2-emotion-recognition",
    device=-1,
)
|
|
| |
# Module-level search state, replaced wholesale by index_segments().
# text_segments[i] is the chunk whose embedding sits at row i of `index`.
text_segments = []
# Inner-product index sized to the encoder's embedding width.
index = faiss.IndexFlatIP(embedding_dim)
|
|
# Arabic domain terms to look for in a transcript, mapped to the English
# label shown next to each hit. Keys are matched as plain substrings by
# detect_keywords().
KEYWORDS = {
    "ذكاء اصطناعي": "AI",
    "تعلم عميق": "Deep Learning",
    "شبكة عصبية": "Neural Network",
    "تعلم آلي": "Machine Learning",
    "معالجة اللغات": "NLP",
    "رؤية حاسوبية": "Computer Vision",
    "بيانات": "Data",
    "نموذج": "Model",
    "تدريب": "Training",
    "خوارزمية": "Algorithm",
    "تصنيف": "Classification",
    "استرجاع": "Retrieval",
    "تحليل": "Analysis",
    "محاضرة": "Lecture",
    "جامعة": "University",
    "بحث": "Research",
    "مشروع": "Project",
}
|
|
# Emoji shown next to the predicted emotion label (looked up by the
# lower-cased label in process_audio()).
# NOTE(review): the glyphs for "happy", "neutral" and "calm" are best-guess
# restorations of mis-encoded literals; confirm against the intended design.
EMOTION_ICONS = {
    "happy": "😊",
    "sad": "😢",
    "angry": "😡",
    "neutral": "😐",
    "calm": "😌",
    "fearful": "😨",
    "disgust": "🤢",
    "surprised": "😲",
}
|
|
|
|
| |
def encode_texts(texts, prefix="passage: "):
    """Embed *texts* with the shared sentence encoder.

    Args:
        texts: Iterable of strings to embed.
        prefix: String prepended to every text before encoding; callers in
            this file use ``"passage: "`` for segments and ``"query: "``
            for search queries.

    Returns:
        A float32 NumPy array of the normalised embeddings returned by the
        encoder.
    """
    tagged_inputs = []
    for text in texts:
        tagged_inputs.append(prefix + text)
    vectors = embedding_model.encode(tagged_inputs, normalize_embeddings=True)
    # FAISS works on float32 data, so cast explicitly.
    return np.array(vectors, dtype="float32")
|
|
|
|
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level ASR pipeline.

    Args:
        audio_path: Path of the audio file handed to the pipeline.

    Returns:
        ``(full_text, chunks)`` where ``full_text`` is the pipeline's
        ``"text"`` output and ``chunks`` is its ``"chunks"`` list. When the
        pipeline yields no chunks, a single chunk holding the full text with
        a ``(0.0, 0.0)`` timestamp is substituted so callers always get at
        least one segment.
    """
    # NOTE(review): newer transformers releases prefer passing
    # language/task in generate_kwargs over forced_decoder_ids; confirm
    # against the pinned version.
    asr_output = asr_pipeline(
        audio_path,
        return_timestamps=True,
        generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
    )
    transcript = asr_output["text"]
    segments = asr_output.get("chunks", []) or [
        {"text": transcript, "timestamp": (0.0, 0.0)}
    ]
    return transcript, segments
|
|
|
|
def summarize_text(text, max_input=512, max_output=150):
    """Summarise *text* with the module-level seq2seq model.

    Args:
        text: Text to summarise. ``None``, empty or whitespace-only input
            yields an empty summary without running the model.
        max_input: Maximum number of input tokens; longer input is
            truncated by the tokenizer.
        max_output: ``max_length`` passed to generation.

    Returns:
        The decoded summary string, or ``""`` when there is nothing to
        summarise.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        # With no content the model would only see padding and invent a
        # summary, so skip generation entirely.
        return ""

    inputs = summ_tokenizer(
        [cleaned],
        max_length=max_input,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    summary_ids = summ_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=2,
        max_length=max_output,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    return summ_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
|
|
|
def detect_emotion(audio_path):
    """Classify the emotion of the opening of an audio file.

    Only the first 15 seconds are loaded, resampled to 16 kHz, and passed
    to the module-level emotion classifier.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        ``(label, score)`` of the highest-scoring prediction.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000, duration=15.0)
    classifier_input = {"array": waveform, "sampling_rate": sample_rate}
    scored_labels = emotion_classifier(classifier_input)
    best = max(scored_labels, key=lambda candidate: candidate["score"])
    return best["label"], best["score"]
|
|
|
|
def detect_keywords(text, keywords=None):
    """Count occurrences of known keywords in *text*.

    Args:
        text: Transcript to scan. ``None`` or an empty string yields no hits.
        keywords: Optional mapping of search term to display label. Defaults
            to the module-level ``KEYWORDS`` table.

    Returns:
        A list of ``{"keyword_ar", "keyword_en", "count"}`` dicts for every
        term that occurs at least once, ordered by descending count. Terms
        with equal counts keep the order of the keyword table.
    """
    if not text:
        return []
    table = KEYWORDS if keywords is None else keywords

    hits = []
    for term, label in table.items():
        # Plain, non-overlapping substring count.
        occurrences = text.count(term)
        if occurrences > 0:
            hits.append(
                {"keyword_ar": term, "keyword_en": label, "count": occurrences}
            )
    # list.sort is stable, so ties preserve table order.
    hits.sort(key=lambda hit: hit["count"], reverse=True)
    return hits
|
|
|
|
def index_segments(chunks):
    """Rebuild the semantic-search index from transcript chunks.

    The new index is fully populated before the module-level ``index`` and
    ``text_segments`` are rebound, so a failure while embedding leaves the
    previous search state untouched.

    Args:
        chunks: Sequence of dicts with at least a ``"text"`` key. An empty
            sequence resets the search state to an empty index.

    Returns:
        The number of indexed segments.
    """
    global index, text_segments

    segments = list(chunks)
    fresh_index = faiss.IndexFlatIP(embedding_dim)
    if segments:
        # Skip encoding for an empty list: the encoder would return an array
        # that is not the 2-D (n, dim) matrix index.add() requires.
        embeddings = encode_texts(
            [segment["text"] for segment in segments],
            prefix="passage: ",
        )
        fresh_index.add(embeddings)

    # Publish both pieces of state together so row i of the index always
    # corresponds to text_segments[i].
    index, text_segments = fresh_index, segments
    return len(segments)
|
|
|
|
def _format_mmss(seconds):
    """Render a time offset in seconds as ``M:SS``; ``None`` counts as zero."""
    seconds = seconds or 0.0
    return f"{int(seconds // 60)}:{int(seconds % 60):02d}"


def search_query(query, top_k=3):
    """Return the transcript segments most similar to *query* as Markdown.

    Args:
        query: Search text; embedded with the ``"query: "`` prefix.
        top_k: Maximum number of segments to return.

    Returns:
        A Markdown string with one entry per hit (rank, similarity as a
        percentage, time range, quoted segment text), or a user-facing
        Arabic message when nothing is indexed or nothing matches.
    """
    if index.ntotal == 0:
        return "لم يتم تحميل أي ملف صوتي بعد. قم برفع ملف أولاً."

    k = min(top_k, index.ntotal)
    if k < 1:
        # A non-positive k is not a valid FAISS request; treat it as no hits.
        return "لا توجد نتائج"

    query_emb = encode_texts([query], prefix="query: ")
    scores, indices = index.search(query_emb, k=k)

    segments = text_segments
    results = []
    for rank, (i, score) in enumerate(zip(indices[0], scores[0]), 1):
        # FAISS marks missing neighbours with -1, which would otherwise wrap
        # around to the last segment.
        if not 0 <= i < len(segments):
            continue
        seg = segments[i]
        start, end = seg["timestamp"][0], seg["timestamp"][1]
        time_str = f"{_format_mmss(start)} - {_format_mmss(end)}"
        results.append(
            f"**#{rank}** | تطابق: {score * 100:.1f}% | ⏱️ {time_str}\n> {seg['text']}"
        )
    return "\n\n".join(results) if results else "لا توجد نتائج"
|
|
|
|
| |
def process_audio(audio_path, progress=gr.Progress()):
    """Run the full analysis pipeline on one uploaded recording.

    Steps, in order: emotion detection, transcription, summarisation,
    semantic indexing of the transcript chunks, and keyword extraction.
    Progress is reported through *progress* after each stage.

    Args:
        audio_path: Path of the uploaded audio file, or ``None`` when
            nothing was uploaded.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            the Gradio idiom for having the tracker injected, not a shared
            mutable default.

    Returns:
        ``(emotion_result, full_text, summary, kw_text, seg_info)``, the
        five strings bound to the output widgets.

    Raises:
        gr.Error: If *audio_path* is ``None``.
    """
    if audio_path is None:
        raise gr.Error("يرجى رفع ملف صوتي أولاً")

    progress(0.05, desc="تحليل المشاعر...")
    emotion_label, emotion_conf = detect_emotion(audio_path)
    # NOTE(review): the fallback icon is a best-guess restoration of a
    # mis-encoded literal; confirm the intended glyph.
    icon = EMOTION_ICONS.get(emotion_label.lower(), "🎵")
    emotion_result = f"{icon} {emotion_label} ({emotion_conf * 100:.1f}%)"

    progress(0.25, desc="تحويل الصوت إلى نص...")
    full_text, chunks = transcribe_audio(audio_path)

    progress(0.60, desc="إنشاء الملخص...")
    summary = summarize_text(full_text)

    progress(0.80, desc="فهرسة المقاطع...")
    n_segments = index_segments(chunks)

    progress(0.90, desc="استخراج الكلمات المفتاحية...")
    keywords = detect_keywords(full_text)
    # NOTE(review): the key emoji is a best-guess restoration; confirm.
    kw_text = " ".join(
        f"🔑 {k['keyword_ar']} ({k['keyword_en']}) ×{k['count']}" for k in keywords
    )
    if not kw_text:
        kw_text = "لم يتم العثور على كلمات مفتاحية"

    seg_info = f"✅ تم فهرسة {n_segments} مقطع للبحث الدلالي"

    progress(1.0, desc="تم!")
    return emotion_result, full_text, summary, kw_text, seg_info
|
|
|
|
def do_search(query):
    """UI callback: validate the search box content and run the search.

    Args:
        query: Raw text from the search box; may be ``None`` or blank.

    Returns:
        A prompt asking for a query when the input is blank, otherwise the
        Markdown produced by ``search_query`` for the top five segments.
    """
    cleaned = (query or "").strip()
    if not cleaned:
        return "يرجى إدخال استعلام للبحث"
    return search_query(cleaned, top_k=5)
|
|
|
|
| |
# Page-level styling injected into the Gradio app: container width and font,
# plus the gradient title and muted subtitle used by the header HTML.
CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    font-family: 'Inter', sans-serif !important;
}
.main-title {
    text-align: center;
    background: linear-gradient(135deg, #49f4c8, #7c3aed);
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5rem;
    font-weight: 800;
    margin-bottom: 0.5rem;
}
.sub-title {
    text-align: center;
    color: #a0abc2;
    font-size: 1.1rem;
    margin-bottom: 2rem;
}
"""
|
|
# Gradio UI: upload/record audio, run the analysis pipeline, then search the
# indexed transcript.
# NOTE(review): the Arabic labels below were restored from mis-encoded
# literals. The emoji on the upload, process, transcript, summary, keywords
# and search widgets could not be recovered unambiguously and are best
# guesses; confirm against the intended design.
with gr.Blocks(
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.emerald,
        secondary_hue=gr.themes.colors.purple,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("Inter"),
    ),
    css=CUSTOM_CSS,
    title="ArabEdu",
) as demo:

    # Header: gradient title and Arabic tagline (styled by CUSTOM_CSS).
    gr.HTML(
        """
        <div class="main-title">ArabEdu</div>
        <div class="sub-title">
            نظام فهم المحاضرات العربية — حوّل محاضراتك الصوتية إلى نصوص ذكية وملخصات دقيقة
        </div>
        """
    )

    # Input: the handler receives a file path, from upload or microphone.
    with gr.Row():
        audio_input = gr.Audio(
            label="📁 رفع الملف الصوتي",
            type="filepath",
            sources=["upload", "microphone"],
        )

    process_btn = gr.Button(
        "🚀 معالجة الملف الصوتي",
        variant="primary",
        size="lg",
    )

    # Outputs of process_audio, in the order it returns them.
    with gr.Row():
        emotion_output = gr.Textbox(
            label="🎭 تحليل المشاعر الصوتية",
            interactive=False,
            scale=1,
        )

    with gr.Row():
        with gr.Column(scale=2):
            transcript_output = gr.Textbox(
                label="📝 النص الكامل",
                interactive=False,
                lines=10,
                rtl=True,
            )
        with gr.Column(scale=1):
            summary_output = gr.Textbox(
                label="📋 الملخص",
                interactive=False,
                lines=6,
                rtl=True,
            )
            keywords_output = gr.Textbox(
                label="🔑 الكلمات المفتاحية",
                interactive=False,
                lines=3,
                rtl=True,
            )

    seg_info_output = gr.Textbox(
        label="فهرسة",
        interactive=False,
        visible=True,
    )

    # Semantic search over the segments indexed by the last processed file.
    gr.Markdown("---")
    gr.Markdown("### 🔍 البحث الدلالي في المحتوى")

    with gr.Row():
        search_input = gr.Textbox(
            label="ابحث عن موضوع معين في التسجيل",
            placeholder="مثال: ما هو الذكاء الاصطناعي؟",
            scale=4,
            rtl=True,
        )
        search_btn = gr.Button("🔍 بحث", variant="secondary", scale=1)

    search_output = gr.Markdown(label="نتائج البحث", rtl=True)

    # Event wiring.
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[
            emotion_output,
            transcript_output,
            summary_output,
            keywords_output,
            seg_info_output,
        ],
    )

    # The search runs both on button click and on Enter in the text box.
    for register_search in (search_btn.click, search_input.submit):
        register_search(
            fn=do_search,
            inputs=[search_input],
            outputs=[search_output],
        )
|
|
# Enable request queuing, then start the web server. This runs at import
# time; there is no __main__ guard in this module.
demo.queue().launch()
|
|