|
|
import copy
import io
import os
import tempfile
import warnings

import numpy as np
import requests
import soundfile as sf
from PIL import Image

# decord is an optional dependency: without it only video loading is disabled.
try:
    import decord

    # With this bridge decord hands decoded frames back as torch tensors.
    decord.bridge.set_bridge('torch')
except ImportError:
    decord = None
    print("Warning: 'decord' module not found. Video processing will not work.")

# librosa is an optional dependency used only for audio resampling.
try:
    import librosa
except ImportError:
    librosa = None
|
|
|
|
|
def _load_image(image_path):
    """Load an image from an HTTP(S) URL or a local path and convert it to RGB.

    Args:
        image_path: An ``http://`` / ``https://`` URL or a local file path.

    Returns:
        A fully loaded ``PIL.Image.Image`` in ``RGB`` mode.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the file is missing or cannot be decoded as an image.
    """
    if image_path.startswith(("http://", "https://")):
        response = requests.get(image_path, timeout=10)
        # Fail on HTTP errors here instead of feeding an error page to PIL,
        # which would otherwise surface as a confusing decode error.
        response.raise_for_status()
        source = io.BytesIO(response.content)
    else:
        source = image_path

    # Image.open() is lazy and keeps the underlying file open; convert()
    # returns a new, fully loaded image, so the source can be closed right
    # after the conversion.
    with Image.open(source) as image:
        return image.convert("RGB")
|
|
|
|
|
def _load_audio(audio_path, target_sr=16000):
    """Load audio from an HTTP(S) URL or a local path as a mono signal.

    Multi-channel audio is down-mixed by averaging the channels. If the
    file's sample rate differs from ``target_sr`` the signal is resampled
    with librosa when it is installed.

    Args:
        audio_path: An ``http://`` / ``https://`` URL or a local file path.
        target_sr: Desired sample rate in Hz.

    Returns:
        A 1-D numpy array of samples. It is at ``target_sr`` unless librosa
        is unavailable, in which case the original rate is kept and a
        ``RuntimeWarning`` is emitted.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    if audio_path.startswith(("http://", "https://")):
        response = requests.get(audio_path, timeout=10)
        # Fail on HTTP errors here instead of passing an error page to the
        # audio decoder.
        response.raise_for_status()
        source = io.BytesIO(response.content)
    else:
        source = audio_path

    audio_data, sr = sf.read(source)

    # soundfile returns (frames, channels) for multi-channel input; average
    # over the channel axis to obtain mono.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    if sr != target_sr:
        if librosa is not None:
            audio_data = librosa.resample(
                audio_data, orig_sr=sr, target_sr=target_sr
            )
        else:
            # Previously this case was silent, so callers received audio at
            # the wrong rate without any indication.
            warnings.warn(
                f"librosa is not installed; returning audio at {sr} Hz "
                f"instead of the requested {target_sr} Hz.",
                RuntimeWarning,
                stacklevel=2,
            )

    return audio_data
|
|
|
|
|
def _load_video(video_path, n_frames=8, use_audio=True):
    """Sample evenly spaced frames from a video given by URL or local path.

    Remote videos are streamed into a uniquely named temporary file, which
    is removed again before the function returns.

    Args:
        video_path: An ``http://`` / ``https://`` URL or a local file path.
        n_frames: Number of frames to sample, evenly spread over the video.
        use_audio: Accepted for interface compatibility. Audio extraction is
            not implemented, so the returned audio is always ``None``.

    Returns:
        A tuple ``(pil_frames, audio_data)`` where ``pil_frames`` is a list
        of ``PIL.Image.Image`` objects and ``audio_data`` is ``None``.

    Raises:
        ImportError: If decord is not installed.
        ValueError: If the video contains no frames.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    if decord is None:
        raise ImportError("Please install 'decord' to support video processing.")

    temp_path = None
    try:
        if video_path.startswith(("http://", "https://")):
            # Stream to a unique temp file: a fixed name in the working
            # directory would collide between concurrent calls and was never
            # cleaned up.
            with requests.get(video_path, stream=True, timeout=10) as response:
                response.raise_for_status()
                with tempfile.NamedTemporaryFile(
                    suffix=".mp4", delete=False
                ) as tmp:
                    temp_path = tmp.name
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        if chunk:
                            tmp.write(chunk)
            reader = decord.VideoReader(temp_path)
        else:
            reader = decord.VideoReader(video_path)

        total_frames = len(reader)
        if total_frames == 0:
            raise ValueError(f"Video contains no frames: {video_path}")

        frame_indices = np.linspace(0, total_frames - 1, n_frames, dtype=int)
        batch = reader.get_batch(frame_indices)

        # The return type depends on the active decord bridge: the native
        # NDArray offers .asnumpy(), while a torch tensor (the bridge this
        # module configures at import) only offers .numpy().
        if hasattr(batch, "asnumpy"):
            frames = batch.asnumpy()
        else:
            frames = batch.cpu().numpy()

        pil_frames = [Image.fromarray(frame) for frame in frames]

        # Drop the reader so the temporary file is no longer held open
        # when it is removed below.
        del reader
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask
                # the real result or error.
                pass

    # Audio extraction is not implemented; `use_audio` has no effect yet.
    audio_data = None

    return pil_frames, audio_data
|
|
|
|
|
def process_mm_info(conversation, use_audio_in_video=True):
    """Collect and load all media referenced in a conversation.

    Walks every message whose ``content`` is a list and loads the items of
    type ``"audio"``, ``"image"`` and ``"video"`` from the URL or path stored
    under the key of the same name. Other items (e.g. text) are ignored.

    Loading is best-effort: a failure for one item is reported on stdout and
    that item is skipped, so the returned lists may contain fewer entries
    than the conversation references.

    Args:
        conversation: Iterable of message mappings. It is only read, never
            modified, so no defensive copy is made.
        use_audio_in_video: Forwarded to the video loader; when true, any
            audio it returns is appended to the audio list.

    Returns:
        A tuple ``(audios, images, videos)`` of lists with the loaded audio
        arrays, RGB images and per-video frame lists, in encounter order.
    """
    audios = []
    images = []
    videos = []

    for message in conversation:
        if "content" not in message or not isinstance(message["content"], list):
            continue

        for item in message["content"]:
            # Resolve the type once, defensively. The error handler below
            # needs it too, and indexing item["type"] there used to raise a
            # second time for items without a "type" key or non-dict items.
            item_type = item.get("type") if isinstance(item, dict) else None

            try:
                if item_type == "audio":
                    audios.append(_load_audio(item["audio"]))
                elif item_type == "image":
                    images.append(_load_image(item["image"]))
                elif item_type == "video":
                    video_frames, video_audio = _load_video(
                        item["video"],
                        use_audio=use_audio_in_video,
                    )
                    videos.append(video_frames)
                    if use_audio_in_video and video_audio is not None:
                        audios.append(video_audio)
            except Exception as e:
                # Deliberate best-effort boundary: one broken media item
                # must not abort processing of the rest.
                print(f"Error processing {item_type}: {e}")

    return audios, images, videos