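# Final_App / app.py
# Hugging Face file-viewer residue captured together with the source:
# "ek-5's picture" / "Update app.py" / commit aeb7d8a (verified) /
# "raw" / "history blame" / 4 kB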
import io
import os

import torch
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
from ultralytics import YOLO
# --- 1. Application and model setup ---
app = FastAPI(title="YOLO + GIT Large: Final Visual Description API")

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

MY_MODEL_PATH = 'best.pt'
print(f"🔄 جاري التحميل على جهاز: {device}...")

# Load the custom YOLO weights; if that fails for any reason, report the
# error and fall back to the stock "yolov8n.pt" weights so the service can
# still start.
try:
    detection_model = YOLO(MY_MODEL_PATH)
    print("✅ YOLO Model: Loaded successfully")
except Exception as load_error:
    print(f"⚠️ YOLO Warning: Using default yolov8n.pt - {load_error}")
    detection_model = YOLO("yolov8n.pt")

# Load the GIT-Large captioning model and its processor, and move the model
# to the selected device.
model_name = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_name)
caption_model = AutoModelForCausalLM.from_pretrained(model_name)
caption_model = caption_model.to(device)
print(f"✅ Caption Model: {model_name} Loaded")
@app.get("/")
def home():
    """Return a small status payload pointing callers at the /docs page."""
    status_payload = {
        "status": "Online",
        "instruction": "Use /docs to test the /analyze endpoint",
    }
    return status_payload
# --- 2. Image processing and analysis ---

# Minimum YOLO confidence for a detection to be kept.
_DETECTION_CONFIDENCE = 0.25

# Pixels of surrounding context kept around each detected box so the
# captioning model can see the object's shape and colour clearly.
_CROP_PADDING = 20

# Generation settings used for per-object captions.
_OBJECT_CAPTION_OPTIONS = {
    "max_length": 60,  # long enough to describe colour and shape
    "min_length": 12,  # push the model towards a detailed description
    "num_beams": 5,  # beam search for better word choice
    "repetition_penalty": 1.5,
    "early_stopping": True,
}


def _generate_caption(image, **generate_kwargs):
    """Return the caption the GIT model generates for *image*.

    No text prompt is passed to the model; only the pixel values are used.

    Args:
        image: PIL image to describe.
        **generate_kwargs: Forwarded unchanged to ``caption_model.generate``.

    Returns:
        The first decoded sequence with special tokens removed.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = caption_model.generate(
        pixel_values=inputs.pixel_values,
        **generate_kwargs,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def _crop_with_padding(image, xyxy, pad=_CROP_PADDING):
    """Crop *image* to the ``(x1, y1, x2, y2)`` box grown by *pad* pixels.

    The grown box is clamped to the image bounds.
    """
    x_min, y_min, x_max, y_max = xyxy
    region = (
        max(0, x_min - pad),
        max(0, y_min - pad),
        min(image.width, x_max + pad),
        min(image.height, y_max + pad),
    )
    return image.crop(region)


@app.post("/analyze")
async def analyze_image(file: UploadFile = File(...)):
    """Detect objects in an uploaded image and caption each one.

    Every YOLO detection is cropped (with padding) and captioned by the GIT
    model. When nothing is detected, the whole image is captioned instead.

    Args:
        file: The uploaded image file.

    Returns:
        ``{"detected_count": ..., "results": [...]}`` when at least one
        object was detected, otherwise ``{"message": ...,
        "general_scene_description": ...}``.

    Raises:
        HTTPException: 400 when the upload cannot be decoded as an image.

    NOTE(review): detection and captioning run synchronously inside this
    coroutine, so the event loop is blocked while a request is processed.
    Moving them to a worker thread would first need confirmation that the
    shared model objects can be called concurrently.
    """
    # Read and decode the upload.
    data = await file.read()
    try:
        original_image = Image.open(io.BytesIO(data)).convert("RGB")
    except OSError as exc:
        # Pillow's UnidentifiedImageError subclasses OSError, so this covers
        # both unrecognised and truncated/corrupt files. Report a client
        # error instead of letting it surface as a 500.
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a readable image.",
        ) from exc

    # Object detection with YOLO.
    results = detection_model(original_image, conf=_DETECTION_CONFIDENCE)

    integrated_results = []
    for r in results:
        for box in r.boxes:
            label = r.names[int(box.cls[0])]
            cropped_img = _crop_with_padding(
                original_image, box.xyxy[0].tolist()
            )
            description = _generate_caption(
                cropped_img, **_OBJECT_CAPTION_OPTIONS
            )
            integrated_results.append({
                # Running count, so ids stay unique even if the detector
                # ever returns more than one result object.
                "object_id": len(integrated_results) + 1,
                "label": label,
                "confidence": f"{float(box.conf[0]):.2f}",
                "visual_description": f"Detected {label}: {description.strip()}",
            })

    # Nothing detected: describe the scene as a whole instead.
    if not integrated_results:
        general_desc = _generate_caption(original_image, max_length=50)
        return {
            "message": "No specific objects detected by YOLO.",
            "general_scene_description": general_desc,
        }

    return {
        "detected_count": len(integrated_results),
        "results": integrated_results,
    }
# --- 3. Run the server ---
if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    # NOTE(review): presumably the port the hosting platform expects — confirm.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)