Update app.py
app.py CHANGED
@@ -1,21 +1,24 @@
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, Blip2Processor, Blip2ForConditionalGeneration, WhisperProcessor, WhisperForConditionalGeneration
+from TTS.api import TTS
 import numpy as np
 from PIL import Image
+import fitz  # PyMuPDF
+import pandas as pd
 
 # Initialize device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Text model: Mistral 7B
-mistral_model_name = "mistralai/
+mistral_model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
-mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_name).to(device)
+mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_name, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)
 
 # Image model: BLIP-2
 blip_model_name = "Salesforce/blip2-opt-2.7b"
 blip_processor = Blip2Processor.from_pretrained(blip_model_name)
-blip_model = Blip2ForConditionalGeneration.from_pretrained(blip_model_name).to(device)
+blip_model = Blip2ForConditionalGeneration.from_pretrained(blip_model_name, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)
 
 # Speech-to-text: Whisper
 whisper_model_name = "openai/whisper-small"
@@ -25,23 +28,28 @@ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_na
 # Text-to-speech: Coqui TTS
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
+# === Text generation ===
 def generate_text_response(prompt):
     try:
-
+        friendly_prompt = f"أجب على السؤال التالي بطريقة ودية وواضحة:\n{prompt}"
+        inputs = mistral_tokenizer(friendly_prompt, return_tensors="pt").to(device)
         outputs = mistral_model.generate(**inputs, max_length=200, num_return_sequences=1)
         return mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"خطأ في معالجة النص: {str(e)}"
 
+# === Image analysis ===
 def analyze_image(image, question=None):
     try:
         image = Image.fromarray(image).convert("RGB")
-
+        prompt = question if question else "صف محتوى الصورة بالتفصيل"
+        inputs = blip_processor(images=image, text=prompt, return_tensors="pt").to(device)
         outputs = blip_model.generate(**inputs)
         return blip_processor.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"خطأ في تحليل الصورة: {str(e)}"
 
+# === Audio processing ===
 def process_audio(audio):
     try:
         sample_rate, audio_data = audio
@@ -56,26 +64,46 @@ def process_audio(audio):
     except Exception as e:
         return f"خطأ في معالجة الصوت: {str(e)}", "", None
 
-#
-
-
-
-
+# === File processing ===
+def process_file(file):
+    try:
+        if file.name.endswith(".pdf"):
+            with fitz.open(file.name) as doc:
+                text = "\n".join(page.get_text() for page in doc)
+        elif file.name.endswith((".xlsx", ".xls")):
+            df = pd.read_excel(file.name)
+            text = df.to_string()
+        elif file.name.endswith(".csv"):
+            df = pd.read_csv(file.name)
+            text = df.to_string()
+        else:
+            return "❌ نوع الملف غير مدعوم حالياً."
+
+        response = generate_text_response(f"الملف يحتوي على:\n{text}\n\nلخص المحتوى.")
+        return response
+    except Exception as e:
+        return f"خطأ في قراءة الملف: {str(e)}"
 
-
+# === Gradio Interface ===
+with gr.Blocks(css=".gradio-container {background-color: #f0f4f8; font-family: Arial; color: #333; padding: 20px;}") as demo:
+    gr.Markdown("# 🤖 Kemo Chat - مساعد ذكي متعدد الوسائط")
+    gr.Markdown("🎯 تفاعل معي عبر النصوص، الصور، الصوت أو الملفات! يدعم العربية والإنجليزية.")
+    gr.Markdown("📁 يدعم الملفات: PDF، Excel، CSV\n🖼️ يدعم الوصف الذكي للصور\n🎙️ تحويل الصوت إلى نص والرد صوتياً")
+
+    with gr.Tab("💬 المحادثة النصية"):
         text_input = gr.Textbox(label="اكتب سؤالك أو رسالتك")
         text_output = gr.Textbox(label="الرد")
         text_submit = gr.Button("إرسال")
         text_submit.click(fn=generate_text_response, inputs=text_input, outputs=text_output)
 
-    with gr.Tab("تحليل الصور"):
+    with gr.Tab("🖼️ تحليل الصور"):
        image_input = gr.Image(label="ارفع صورة")
        image_question = gr.Textbox(label="اختياري: اسأل سؤال عن الصورة")
        image_output = gr.Textbox(label="الوصف أو الإجابة")
        image_submit = gr.Button("تحليل الصورة")
        image_submit.click(fn=analyze_image, inputs=[image_input, image_question], outputs=image_output)
 
-    with gr.Tab("التفاعل الصوتي"):
+    with gr.Tab("🎤 التفاعل الصوتي"):
        audio_input = gr.Audio(source="microphone", label="سجّل رسالتك")
        audio_transcription = gr.Textbox(label="النص المستخرج")
        audio_text_response = gr.Textbox(label="الرد النصي")
@@ -83,4 +111,10 @@ with gr.Blocks(css=".gradio-container {background-color: #f0f4f8; font-family: A
         audio_submit = gr.Button("معالجة الصوت")
         audio_submit.click(fn=process_audio, inputs=audio_input, outputs=[audio_transcription, audio_text_response, audio_output])
 
+    with gr.Tab("📄 تحليل الملفات"):
+        file_input = gr.File(label="ارفع ملفك (PDF, Excel, CSV)")
+        file_output = gr.Textbox(label="الرد على محتوى الملف")
+        file_submit = gr.Button("تحليل الملف")
+        file_submit.click(fn=process_file, inputs=file_input, outputs=file_output)
+
 demo.launch()