"""ArabEdu: Arabic lecture understanding demo.

Pipeline for one uploaded audio file:
    1. audio emotion classification (wav2vec2),
    2. Arabic speech-to-text with timestamps (Whisper),
    3. abstractive summary (mT5 XLSum),
    4. keyword spotting against a fixed Arabic/English glossary,
    5. semantic indexing of transcript segments (E5 embeddings + FAISS).

A Gradio UI exposes the pipeline and a semantic search box over the indexed
segments.
"""

import os

import numpy as np
import faiss
import torch
import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import librosa

device = "cpu"

# --------------- Load Models ---------------

# Speech recognition; long audio is processed in 30-second chunks.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,
    device=device,
)

# Decoder prompt that pins Whisper to Arabic transcription.
# NOTE(review): newer transformers releases prefer passing
# generate_kwargs={"language": ..., "task": ...} instead of
# forced_decoder_ids -- confirm against the installed version.
forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(
    language="arabic", task="transcribe"
)

summ_model_name = "csebuetnlp/mT5_multilingual_XLSum"
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model_name)
summ_model = AutoModelForSeq2SeqLM.from_pretrained(summ_model_name)

embedding_model = SentenceTransformer("intfloat/multilingual-e5-base")
embedding_dim = embedding_model.get_sentence_embedding_dimension()

emotion_classifier = pipeline(
    "audio-classification",
    model="Dpngtm/wav2vec2-emotion-recognition",
    device=device,
)

# --------------- FAISS Index ---------------

# Inner-product index; embeddings are L2-normalised in encode_texts(), so the
# inner product equals cosine similarity.
index = faiss.IndexFlatIP(embedding_dim)
# Transcript chunks, positionally aligned with the vectors stored in `index`.
text_segments = []

# Arabic keyword -> English label shown next to it in the UI.
KEYWORDS = {
    "ذكاء اصطناعي": "AI",
    "تعلم عميق": "Deep Learning",
    "شبكة عصبية": "Neural Network",
    "تعلم آلي": "Machine Learning",
    "معالجة اللغات": "NLP",
    "رؤية حاسوبية": "Computer Vision",
    "بيانات": "Data",
    "نموذج": "Model",
    "تدريب": "Training",
    "خوارزمية": "Algorithm",
    "تصنيف": "Classification",
    "استرجاع": "Retrieval",
    "تحليل": "Analysis",
    "محاضرة": "Lecture",
    "جامعة": "University",
    "بحث": "Research",
    "مشروع": "Project",
}

# Lower-cased emotion label -> emoji shown in the emotion textbox.
EMOTION_ICONS = {
    "happy": "😊",
    "sad": "😢",
    "angry": "😡",
    "neutral": "😐",
    "calm": "😌",
    "fearful": "😨",
    "disgust": "🤢",
    "surprised": "😲",
}


# --------------- Pipeline Functions ---------------

def encode_texts(texts, prefix="passage: "):
    """Embed ``texts`` with the sentence-embedding model.

    Args:
        texts: Iterable of strings to embed.
        prefix: String prepended to every text before encoding
            (``"passage: "`` for indexed segments, ``"query: "`` for queries).

    Returns:
        A float32 NumPy array of L2-normalised embeddings, one row per text.
    """
    prefixed = [prefix + t for t in texts]
    embeddings = embedding_model.encode(prefixed, normalize_embeddings=True)
    return np.array(embeddings).astype("float32")


def transcribe_audio(audio_path):
    """Transcribe an audio file with the ASR pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        ``(full_text, chunks)`` where ``chunks`` is a non-empty list of dicts
        with ``"text"`` and ``"timestamp"`` keys. When the pipeline returns no
        chunks, a single chunk holding the full text with a ``(0.0, 0.0)``
        timestamp is substituted so downstream indexing always has input.
    """
    result = asr_pipeline(
        audio_path,
        return_timestamps=True,
        generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
    )
    full_text = result["text"]
    chunks = result.get("chunks", [])
    if not chunks:
        chunks = [{"text": full_text, "timestamp": (0.0, 0.0)}]
    return full_text, chunks


def summarize_text(text, max_input=512, max_output=150):
    """Produce an abstractive summary of ``text``.

    Args:
        text: Text to summarise.
        max_input: Maximum number of input tokens; longer input is truncated.
        max_output: Maximum length of the generated summary in tokens.

    Returns:
        The decoded summary, or ``""`` when ``text`` is empty or whitespace
        only (there is nothing meaningful to summarise in that case).
    """
    text = text.strip()
    if not text:
        return ""

    inputs = summ_tokenizer(
        [text],
        max_length=max_input,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    summary_ids = summ_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=2,
        max_length=max_output,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    return summ_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def detect_emotion(audio_path):
    """Classify the emotion of the opening of an audio file.

    Only the first 15 seconds are loaded, resampled to 16 kHz.

    Args:
        audio_path: Path of the audio file.

    Returns:
        ``(label, score)`` of the highest-scoring prediction.
    """
    audio, sr = librosa.load(audio_path, sr=16000, duration=15.0)
    predictions = emotion_classifier({"array": audio, "sampling_rate": sr})
    top = max(predictions, key=lambda x: x["score"])
    return top["label"], top["score"]


def detect_keywords(text):
    """Count occurrences of each glossary keyword in ``text``.

    Args:
        text: Transcript text to scan.

    Returns:
        A list of ``{"keyword_ar", "keyword_en", "count"}`` dicts for the
        keywords that occur at least once, most frequent first.
    """
    found = []
    for ar, en in KEYWORDS.items():
        count = text.count(ar)
        if count > 0:
            found.append({"keyword_ar": ar, "keyword_en": en, "count": count})
    found.sort(key=lambda x: x["count"], reverse=True)
    return found


def index_segments(chunks):
    """Rebuild the global search index from transcript ``chunks``.

    The new index is built locally and swapped in together with
    ``text_segments`` only after encoding succeeds, so a failure while
    embedding cannot leave the two globals out of sync.

    Args:
        chunks: List of dicts with at least a ``"text"`` key.

    Returns:
        The number of indexed segments.
    """
    global index, text_segments

    new_index = faiss.IndexFlatIP(embedding_dim)
    if chunks:
        segment_texts = [c["text"] for c in chunks]
        new_index.add(encode_texts(segment_texts, prefix="passage: "))

    index = new_index
    text_segments = list(chunks)
    return len(text_segments)


def _format_timestamp(seconds):
    """Format a second count as ``M:SS``; ``None`` is treated as zero."""
    minutes, secs = divmod(int(seconds or 0.0), 60)
    return f"{minutes}:{secs:02d}"


def search_query(query, top_k=3):
    """Search the indexed segments for the ones most similar to ``query``.

    Args:
        query: Search text.
        top_k: Maximum number of results to return.

    Returns:
        A Markdown string listing rank, similarity, time range and text of
        each hit, or an Arabic message when nothing is indexed / found.
    """
    if index.ntotal == 0:
        return "لم يتم تحميل أي ملف صوتي بعد. قم برفع ملف أولاً."

    query_emb = encode_texts([query], prefix="query: ")
    scores, indices = index.search(query_emb, k=min(top_k, index.ntotal))

    results = []
    for rank, (i, score) in enumerate(zip(indices[0], scores[0]), 1):
        # FAISS pads missing hits with -1; without the lower bound a negative
        # label would silently index the list from its end.
        if not 0 <= i < len(text_segments):
            continue
        seg = text_segments[i]
        start, end = seg["timestamp"][0], seg["timestamp"][1]
        time_str = f"{_format_timestamp(start)} - {_format_timestamp(end)}"
        results.append(
            f"**#{rank}** | تطابق: {score * 100:.1f}% | ⏱️ {time_str}\n> {seg['text']}"
        )
    return "\n\n".join(results) if results else "لا توجد نتائج"


# --------------- Main Process ---------------

def process_audio(audio_path, progress=gr.Progress()):
    """Run the full analysis pipeline on one audio file.

    Args:
        audio_path: Path of the uploaded audio file, or ``None``.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        ``(emotion_result, full_text, summary, kw_text, seg_info)`` strings,
        in the order of the UI outputs wired to the process button.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if audio_path is None:
        raise gr.Error("يرجى رفع ملف صوتي أولاً")

    progress(0.05, desc="تحليل المشاعر...")
    emotion_label, emotion_conf = detect_emotion(audio_path)
    icon = EMOTION_ICONS.get(emotion_label.lower(), "🎵")
    emotion_result = f"{icon} {emotion_label} ({emotion_conf * 100:.1f}%)"

    progress(0.25, desc="تحويل الصوت إلى نص...")
    full_text, chunks = transcribe_audio(audio_path)

    progress(0.60, desc="إنشاء الملخص...")
    summary = summarize_text(full_text)

    progress(0.80, desc="فهرسة المقاطع...")
    n_segments = index_segments(chunks)

    progress(0.90, desc="استخراج الكلمات المفتاحية...")
    keywords = detect_keywords(full_text)
    kw_text = " ".join(
        f"🔑 {k['keyword_ar']} ({k['keyword_en']}) ×{k['count']}" for k in keywords
    )
    if not kw_text:
        kw_text = "لم يتم العثور على كلمات مفتاحية"

    seg_info = f"✅ تم فهرسة {n_segments} مقطع للبحث الدلالي"
    progress(1.0, desc="تم!")
    return emotion_result, full_text, summary, kw_text, seg_info


def do_search(query):
    """UI callback: validate ``query`` and return the top-5 search results."""
    if not query or not query.strip():
        return "يرجى إدخال استعلام للبحث"
    return search_query(query.strip(), top_k=5)


# --------------- Gradio UI ---------------

CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    font-family: 'Inter', sans-serif !important;
}
.main-title {
    text-align: center;
    background: linear-gradient(135deg, #49f4c8, #7c3aed);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 2.5rem;
    font-weight: 800;
    margin-bottom: 0.5rem;
}
.sub-title {
    text-align: center;
    color: #a0abc2;
    font-size: 1.1rem;
    margin-bottom: 2rem;
}
"""

# NOTE(review): the nesting of rows/columns below was reconstructed from a
# whitespace-collapsed source -- confirm it matches the intended layout.
with gr.Blocks(
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.emerald,
        secondary_hue=gr.themes.colors.purple,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("Inter"),
    ),
    css=CUSTOM_CSS,
    title="ArabEdu",
) as demo:
    # The header uses the .main-title / .sub-title classes from CUSTOM_CSS;
    # without these wrappers those styles are never applied.
    gr.HTML(
        """
        <div class="main-title">ArabEdu</div>
        <div class="sub-title">نظام فهم المحاضرات العربية — حوّل محاضراتك الصوتية إلى نصوص ذكية وملخصات دقيقة</div>
        """
    )

    with gr.Row():
        audio_input = gr.Audio(
            label="📁 رفع الملف الصوتي",
            type="filepath",
            sources=["upload", "microphone"],
        )

    process_btn = gr.Button(
        "🚀 معالجة الملف الصوتي",
        variant="primary",
        size="lg",
    )

    with gr.Row():
        emotion_output = gr.Textbox(
            label="🎭 تحليل المشاعر الصوتية",
            interactive=False,
            scale=1,
        )

    with gr.Row():
        with gr.Column(scale=2):
            transcript_output = gr.Textbox(
                label="📝 النص الكامل",
                interactive=False,
                lines=10,
                rtl=True,
            )
        with gr.Column(scale=1):
            summary_output = gr.Textbox(
                label="📋 الملخص",
                interactive=False,
                lines=6,
                rtl=True,
            )
            keywords_output = gr.Textbox(
                label="🔑 الكلمات المفتاحية",
                interactive=False,
                lines=3,
                rtl=True,
            )

    seg_info_output = gr.Textbox(
        label="فهرسة",
        interactive=False,
        visible=True,
    )

    gr.Markdown("---")
    gr.Markdown("### 🔍 البحث الدلالي في المحتوى")

    with gr.Row():
        search_input = gr.Textbox(
            label="ابحث عن موضوع معين في التسجيل",
            placeholder="مثال: ما هو الذكاء الاصطناعي؟",
            scale=4,
            rtl=True,
        )
        search_btn = gr.Button("🔍 بحث", variant="secondary", scale=1)

    search_output = gr.Markdown(label="نتائج البحث", rtl=True)

    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[
            emotion_output,
            transcript_output,
            summary_output,
            keywords_output,
            seg_info_output,
        ],
    )
    # The search runs from both the button and pressing Enter in the textbox.
    search_btn.click(
        fn=do_search,
        inputs=[search_input],
        outputs=[search_output],
    )
    search_input.submit(
        fn=do_search,
        inputs=[search_input],
        outputs=[search_output],
    )

# Guarded so importing this module (e.g. for tests or reload tooling) does not
# start a server; running the file as a script behaves as before.
if __name__ == "__main__":
    demo.queue()
    demo.launch()