clementBE committed on
Commit
9cffd38
Β·
verified Β·
1 Parent(s): 1ee281a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -256
app.py CHANGED
@@ -1,264 +1,172 @@
1
  import os
2
- import zipfile
3
  import tempfile
4
- import requests
5
- import numpy as np
6
- import pandas as pd
7
- from PIL import Image
8
  import torch
9
- import torch.nn.functional as F
10
- from torchvision import transforms
11
- from torchvision.models import resnet50, ResNet50_Weights
12
- from sklearn.cluster import MiniBatchKMeans
13
- import matplotlib.pyplot as plt
14
- import io
15
-
16
  import gradio as gr
17
-
18
- # Face analysis
19
- from deepface import DeepFace
20
- import cv2
21
-
22
- # ---------------------------
23
- # Force CPU if no CUDA
24
- # ---------------------------
25
- if not torch.cuda.is_available():
26
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
27
-
28
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29
-
30
- # ---------------------------
31
- # Load ResNet50
32
- # ---------------------------
33
- weights = ResNet50_Weights.DEFAULT
34
- model = resnet50(weights=weights).to(device)
35
- model.eval()
36
-
37
- # ---------------------------
38
- # Transformations
39
- # ---------------------------
40
- transform = transforms.Compose([
41
- transforms.Resize(256),
42
- transforms.CenterCrop(224),
43
- transforms.ToTensor(),
44
- transforms.Normalize(mean=[0.485, 0.456, 0.406],
45
- std=[0.229, 0.224, 0.225]),
46
- ])
47
-
48
- # ---------------------------
49
- # ImageNet labels
50
- # ---------------------------
51
- LABELS_URL = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
52
- imagenet_classes = [line.strip() for line in requests.get(LABELS_URL).text.splitlines()]
53
-
54
- # ---------------------------
55
- # Color utilities
56
- # ---------------------------
57
- BASIC_COLORS = {
58
- "Red": (255, 0, 0),
59
- "Green": (0, 255, 0),
60
- "Blue": (0, 0, 255),
61
- "Yellow": (255, 255, 0),
62
- "Cyan": (0, 255, 255),
63
- "Magenta": (255, 0, 255),
64
- "Black": (0, 0, 0),
65
- "White": (255, 255, 255),
66
- "Gray": (128, 128, 128),
67
  }
68
 
69
- def closest_basic_color(rgb):
70
- r, g, b = rgb
71
- min_dist = float("inf")
72
- closest_color = None
73
- for name, (cr, cg, cb) in BASIC_COLORS.items():
74
- dist = (r - cr) ** 2 + (g - cg) ** 2 + (b - cb) ** 2
75
- if dist < min_dist:
76
- min_dist = dist
77
- closest_color = name
78
- return closest_color
79
-
80
- def get_dominant_color(image, num_colors=5):
81
- image = image.resize((100, 100))
82
- pixels = np.array(image).reshape(-1, 3)
83
- kmeans = MiniBatchKMeans(n_clusters=num_colors, random_state=0, n_init=5)
84
- kmeans.fit(pixels)
85
- dominant_color = kmeans.cluster_centers_[np.argmax(np.bincount(kmeans.labels_))]
86
- dominant_color = tuple(dominant_color.astype(int))
87
- hex_color = f"#{dominant_color[0]:02x}{dominant_color[1]:02x}{dominant_color[2]:02x}"
88
- return dominant_color, hex_color
89
-
90
- # ---------------------------
91
- # Core function
92
- # ---------------------------
93
- def classify_zip_and_analyze_color(zip_file):
94
- results = []
95
-
96
- with tempfile.TemporaryDirectory() as tmpdir:
97
- with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
98
- zip_ref.extractall(tmpdir)
99
-
100
- for fname in sorted(os.listdir(tmpdir)):
101
- if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
102
- img_path = os.path.join(tmpdir, fname)
103
- try:
104
- image = Image.open(img_path).convert("RGB")
105
- except Exception:
106
- continue
107
-
108
- # Classification
109
- input_tensor = transform(image).unsqueeze(0).to(device)
110
- with torch.no_grad():
111
- output = model(input_tensor)
112
- probs = F.softmax(output, dim=1)[0]
113
-
114
- top3_prob, top3_idx = torch.topk(probs, 3)
115
- preds = [(imagenet_classes[idx], f"{prob.item()*100:.2f}%") for idx, prob in zip(top3_idx, top3_prob)]
116
-
117
- # Dominant color
118
- rgb, hex_color = get_dominant_color(image)
119
- basic_color = closest_basic_color(rgb)
120
-
121
- # ---------------------------
122
- # Face detection & characterization
123
- # ---------------------------
124
- face_info = ""
125
- try:
126
- img_cv2 = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
127
- faces = DeepFace.analyze(img_cv2, actions=["age", "gender", "emotion"], enforce_detection=False)
128
- if isinstance(faces, list): # multiple faces
129
- for f in faces:
130
- face_info += f"Age: {f['age']}, Gender: {f['gender']}, Gender Confidence: {f['gender_confidence']*100:.2f}, Emotion: {f['dominant_emotion']}; "
131
- else: # single face
132
- face_info = f"Age: {faces['age']}, Gender: {faces['gender']}, Gender Confidence: {faces['gender_confidence']*100:.2f}, Emotion: {faces['dominant_emotion']}"
133
- except Exception as e:
134
- face_info = "No face detected"
135
-
136
- results.append((
137
- fname,
138
- ", ".join([p[0] for p in preds]),
139
- ", ".join([p[1] for p in preds]),
140
- hex_color,
141
- basic_color,
142
- face_info
143
- ))
144
-
145
- # Build dataframe
146
- df = pd.DataFrame(results, columns=["Filename", "Top 3 Predictions", "Confidence", "Dominant Color", "Basic Color", "Face Info"])
147
-
148
- # Save XLSX
149
- out_xlsx = os.path.join(tempfile.gettempdir(), "results.xlsx")
150
- df.to_excel(out_xlsx, index=False)
151
-
152
- # ---------------------------
153
- # Plot 1: Basic color frequency
154
- # ---------------------------
155
- fig1, ax1 = plt.subplots()
156
- color_counts = df["Basic Color"].value_counts()
157
- ax1.bar(color_counts.index, color_counts.values, color="skyblue")
158
- ax1.set_title("Basic Color Frequency")
159
- ax1.set_ylabel("Count")
160
- buf1 = io.BytesIO()
161
- plt.savefig(buf1, format="png")
162
- plt.close(fig1)
163
- buf1.seek(0)
164
- plot1_img = Image.open(buf1)
165
-
166
- # ---------------------------
167
- # Plot 2: Top prediction distribution
168
- # ---------------------------
169
- fig2, ax2 = plt.subplots()
170
- preds_flat = []
171
- for p in df["Top 3 Predictions"]:
172
- preds_flat.extend(p.split(", "))
173
- pred_counts = pd.Series(preds_flat).value_counts().head(20)
174
- ax2.barh(pred_counts.index[::-1], pred_counts.values[::-1], color="salmon")
175
- ax2.set_title("Top Prediction Distribution")
176
- ax2.set_xlabel("Count")
177
- buf2 = io.BytesIO()
178
- plt.savefig(buf2, format="png", bbox_inches="tight")
179
- plt.close(fig2)
180
- buf2.seek(0)
181
- plot2_img = Image.open(buf2)
182
-
183
- # ---------------------------
184
- # Extract age and gender (confidence ≀ 80%)
185
- # ---------------------------
186
- ages = []
187
- gender_confidence = {"Man": 0, "Woman": 0}
188
-
189
- for info in df["Face Info"]:
190
- if info != "No face detected":
191
- for face_str in info.split(";"):
192
- face_str = face_str.strip()
193
- if face_str:
194
- # Age
195
- age_part = face_str.split(",")[0]
196
- age = int(age_part.replace("Age:", "").strip())
197
- ages.append(age)
198
-
199
- # Gender and confidence
200
- gender_part = face_str.split(",")[1]
201
- gender = gender_part.replace("Gender:", "").strip()
202
-
203
- # Extract confidence
204
- conf = 1.0
205
- for part in face_str.split(","):
206
- if "Gender Confidence:" in part:
207
- conf = float(part.split("Gender Confidence:")[1].strip()) / 100 # convert % to 0-1
208
-
209
- # Only include if confidence ≀ 0.8
210
- if conf <= 0.8:
211
- if gender in gender_confidence:
212
- gender_confidence[gender] += conf
213
- else:
214
- gender_confidence[gender] = conf
215
-
216
- # ---------------------------
217
- # Plot 3: Gender distribution (confidence ≀ 80%)
218
- # ---------------------------
219
- fig3, ax3 = plt.subplots()
220
- ax3.bar(gender_confidence.keys(), gender_confidence.values(), color=["lightblue", "pink"])
221
- ax3.set_title("Gender Distribution (Confidence ≀ 80%)")
222
- ax3.set_ylabel("Sum of Confidence")
223
- buf3 = io.BytesIO()
224
- plt.savefig(buf3, format="png")
225
- plt.close(fig3)
226
- buf3.seek(0)
227
- plot3_img = Image.open(buf3)
228
-
229
- # ---------------------------
230
- # Plot 4: Age distribution
231
- # ---------------------------
232
- fig4, ax4 = plt.subplots()
233
- ax4.hist(ages, bins=range(0, 101, 5), color="lightgreen", edgecolor="black")
234
- ax4.set_title("Age Distribution")
235
- ax4.set_xlabel("Age")
236
- ax4.set_ylabel("Count")
237
- buf4 = io.BytesIO()
238
- plt.savefig(buf4, format="png")
239
- plt.close(fig4)
240
- buf4.seek(0)
241
- plot4_img = Image.open(buf4)
242
-
243
- return df, out_xlsx, plot1_img, plot2_img, plot3_img, plot4_img
244
-
245
- # ---------------------------
246
- # Gradio Interface
247
- # ---------------------------
248
- demo = gr.Interface(
249
- fn=classify_zip_and_analyze_color,
250
- inputs=gr.File(file_types=[".zip"], label="Upload ZIP of images"),
251
- outputs=[
252
- gr.Dataframe(headers=["Filename", "Top 3 Predictions", "Confidence", "Dominant Color", "Basic Color", "Face Info"]),
253
- gr.File(label="Download XLSX"),
254
- gr.Image(type="pil", label="Basic Color Frequency"),
255
- gr.Image(type="pil", label="Top Prediction Distribution"),
256
- gr.Image(type="pil", label="Gender Distribution (≀80% Confidence)"),
257
- gr.Image(type="pil", label="Age Distribution"),
258
- ],
259
- title="Image Classifier with Color & Face Analysis",
260
- description="Upload a ZIP of images. Classifies images, analyzes dominant color, and detects/characterizes faces (age, gender, emotion).",
261
- )
262
 
263
  if __name__ == "__main__":
264
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import os
 
2
  import tempfile
3
+ import datetime
4
+ import time
 
 
5
  import torch
 
 
 
 
 
 
 
6
  import gradio as gr
7
+ import spaces
8
+ from transformers import pipeline
9
+ from docx import Document
10
+ from pydub import AudioSegment
11
+
12
# --- Model definitions ---
# Maps the human-readable dropdown label (used as the Gradio choice) to the
# Hugging Face model id passed to the ASR pipeline.
MODEL_SIZES = {
    "Tiny (Fastest)": "openai/whisper-tiny",
    "Base (Faster)": "openai/whisper-base",
    "Small (Balanced)": "openai/whisper-small",
    "Distil-Large-v3 (General Purpose)": "distil-whisper/distil-large-v3",
    "Distil-Large-v3-FR (French-Specific)": "eustlb/distil-large-v3-fr"
}

# --- Caches ---
# model_cache: dropdown label -> loaded ASR pipeline (avoids reloading per call).
# summary_cache: holds the lazily created summarization pipeline under "summarizer".
model_cache = {}
summary_cache = {}
24
+
25
# --- Whisper pipeline loader ---
def get_model_pipeline(model_name, progress):
    """Return the ASR pipeline for *model_name*, loading and caching it on first use.

    *progress* is a Gradio progress callback used to report loading status.
    """
    cached = model_cache.get(model_name)
    if cached is not None:
        return cached

    progress(0, desc="🚀 Initializing ZeroGPU instance...")
    hf_model_id = MODEL_SIZES[model_name]
    # Pipeline device: first CUDA device when available, otherwise CPU.
    target_device = 0 if torch.cuda.is_available() else "cpu"
    progress(0.1, desc=f"⏳ Loading {model_name} model...")
    asr = pipeline(
        "automatic-speech-recognition",
        model=hf_model_id,
        device=target_device,
    )
    model_cache[model_name] = asr
    progress(0.5, desc="✅ Model loaded successfully!")
    return asr
39
+
40
# --- French summarization pipeline ---
def get_summary_pipeline():
    """Lazily build and memoize the multilingual summarization pipeline."""
    try:
        return summary_cache["summarizer"]
    except KeyError:
        summarizer = pipeline(
            "summarization",
            model="csebuetnlp/mT5_multilingual_XLSum"
        )
        summary_cache["summarizer"] = summarizer
        return summarizer
48
+
49
# --- Export functions ---
def _segment_bounds(segment):
    """Return (start, end) in seconds for a transcription segment.

    Supports both the Hugging Face ASR pipeline chunk shape
    ({"timestamp": (start, end), "text": ...}) and a flat
    {"start": ..., "end": ...} shape. Whisper may emit None as the final
    chunk's end; that falls back to the start time.
    """
    if "timestamp" in segment:
        start, end = segment["timestamp"]
    else:
        start, end = segment.get("start", 0), segment.get("end", 0)
    start = start if start is not None else 0
    end = end if end is not None else start
    return start, end


def _format_vtt_time(seconds):
    """Format seconds as HH:MM:SS.mmm — the cue timing WebVTT requires."""
    millis = int(round(seconds * 1000))
    hours, rem = divmod(millis, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


def create_vtt(segments, file_path):
    """Write *segments* to *file_path* as a WebVTT subtitle file.

    Fixes over the previous version: timestamps are read from the HF
    pipeline's "timestamp" tuple when present (the flat 'start'/'end'
    lookup always defaulted to 0 for Whisper chunks), and cue timings are
    emitted as HH:MM:SS.mmm — str(datetime.timedelta) yields "H:MM:SS",
    which is not valid WebVTT.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("WEBVTT\n\n")
        for i, segment in enumerate(segments):
            start_seconds, end_seconds = _segment_bounds(segment)
            start = _format_vtt_time(start_seconds)
            end = _format_vtt_time(end_seconds)
            f.write(f"{i+1}\n{start} --> {end}\n{segment.get('text', '').strip()}\n\n")
59
+
60
def create_docx(segments, file_path, with_timestamps):
    """Write *segments* to *file_path* as a .docx transcription.

    When *with_timestamps* is true, each segment becomes one paragraph
    prefixed with its [start - end] range; otherwise all segment texts are
    joined into a single paragraph.

    Fix: timestamps are read from the HF pipeline's "timestamp"
    (start, end) tuple when present — the previous flat 'start'/'end'
    lookup always defaulted to 0 for Whisper chunks. A None end (possible
    on the final chunk) falls back to the start time.
    """
    document = Document()
    document.add_heading("Transcription", 0)
    if with_timestamps:
        for segment in segments:
            text = segment.get('text', '').strip()
            # Accept both HF chunk shape and flat start/end keys.
            if "timestamp" in segment:
                start_seconds, end_seconds = segment["timestamp"]
            else:
                start_seconds = segment.get('start', 0)
                end_seconds = segment.get('end', 0)
            start_seconds = start_seconds if start_seconds is not None else 0
            if end_seconds is None:
                end_seconds = start_seconds
            start = str(datetime.timedelta(seconds=int(start_seconds)))
            end = str(datetime.timedelta(seconds=int(end_seconds)))
            document.add_paragraph(f"[{start} - {end}] {text}")
    else:
        full_text = " ".join(segment.get('text', '').strip() for segment in segments)
        document.add_paragraph(full_text)
    document.save(file_path)
75
+
76
# --- Extract audio from video/audio ---
def extract_audio_from_video(file_path):
    """Return a path to an audio file for *file_path*.

    Paths that already carry a common audio extension are returned
    untouched; anything else (e.g. a video container) is decoded with
    pydub and exported to a fresh temporary .wav whose path is returned.
    """
    _, extension = os.path.splitext(file_path)
    if extension.lower() in (".wav", ".mp3", ".m4a", ".flac"):
        # Already audio — no conversion needed.
        return file_path
    wav_target = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    wav_target.close()
    AudioSegment.from_file(file_path).export(wav_target.name, format="wav")
    return wav_target.name
86
+
87
# --- Main transcription function ---
@spaces.GPU  # requests a ZeroGPU slice for the duration of this call
def transcribe_and_export(file, model_size, vtt_output, docx_timestamp_output, docx_no_timestamp_output, generate_summary, progress=gr.Progress()):
    """Transcribe an uploaded audio/video file and build the requested exports.

    Parameters
    ----------
    file : str | None
        Filepath from the gr.Audio input (audio or video), or None.
    model_size : str
        Key into MODEL_SIZES selecting the Whisper model.
    vtt_output, docx_timestamp_output, docx_no_timestamp_output : bool
        Which export files to generate.
    generate_summary : bool
        Whether to also summarize the transcript.
    progress : gr.Progress
        Gradio progress tracker (injected/managed by Gradio).

    Returns
    -------
    tuple
        (transcript text, gr.Files with download paths, audio file path,
        summary text or None, status message). When *file* is None, four
        Nones plus an error message.
    """
    if file is None:
        return None, None, None, None, "Please upload an audio or video file."

    start_time = time.time()
    # Video inputs are first converted to a temporary WAV; plain audio
    # passes through unchanged.
    audio_file_path = extract_audio_from_video(file)

    # Transcription
    pipe = get_model_pipeline(model_size, progress)
    progress(0.75, desc="🎤 Transcribing audio...")
    # The French-specific distil model is pinned to French decoding.
    if model_size == "Distil-Large-v3-FR (French-Specific)":
        raw_output = pipe(audio_file_path, return_timestamps=True, generate_kwargs={"language": "fr"})
    else:
        raw_output = pipe(audio_file_path, return_timestamps=True)

    # NOTE(review): HF pipeline chunks look like {"timestamp": (start, end),
    # "text": ...} — confirm create_vtt/create_docx read that shape.
    segments = raw_output.get("chunks", [])
    outputs = {}  # export label -> generated file path (cwd-relative)
    progress(0.85, desc="📝 Generating output files...")

    if vtt_output:
        vtt_path = "transcription.vtt"
        create_vtt(segments, vtt_path)
        outputs["VTT"] = vtt_path
    if docx_timestamp_output:
        docx_ts_path = "transcription_with_timestamps.docx"
        create_docx(segments, docx_ts_path, with_timestamps=True)
        outputs["DOCX (with timestamps)"] = docx_ts_path
    if docx_no_timestamp_output:
        docx_no_ts_path = "transcription_without_timestamps.docx"
        create_docx(segments, docx_no_ts_path, with_timestamps=False)
        outputs["DOCX (without timestamps)"] = docx_no_ts_path

    transcribed_text = raw_output['text']

    # Generate summary if requested
    summary_text = None
    if generate_summary:
        progress(0.95, desc="📝 Generating summary...")
        summarizer = get_summary_pipeline()
        # NOTE(review): long transcripts may exceed the summarizer's input
        # limit — consider chunking; verify behavior on long recordings.
        summary_output = summarizer(transcribed_text, max_length=150, min_length=30, do_sample=False)
        summary_text = summary_output[0]['summary_text']

    end_time = time.time()
    total_time = end_time - start_time
    downloadable_files = [path for path in outputs.values()]
    status_message = f"✅ Transcription complete! Total time: {total_time:.2f} seconds."

    return transcribed_text, gr.Files(value=downloadable_files, label="Download Transcripts"), audio_file_path, summary_text, status_message
137
+
138
# --- Gradio UI ---
with gr.Blocks(title="Whisper ZeroGPU Transcription") as demo:
    gr.Markdown("# 🎙️ Whisper ZeroGPU Transcription")
    gr.Markdown("Transcribe audio or video files with timestamps, and optionally generate a French summary.")

    with gr.Row():
        # Left: recorder/uploader. Right: model choice + export options.
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio/Video File")
        with gr.Column(scale=2):
            model_selector = gr.Dropdown(
                label="Choose Whisper Model Size",
                choices=list(MODEL_SIZES.keys()),
                value="Distil-Large-v3-FR (French-Specific)"
            )
            gr.Markdown("### Choose Output Formats")
            with gr.Row():
                vtt_checkbox = gr.Checkbox(label="VTT", value=True)
                docx_ts_checkbox = gr.Checkbox(label="DOCX (with timestamps)", value=False)
                docx_no_ts_checkbox = gr.Checkbox(label="DOCX (without timestamps)", value=True)
            summary_checkbox = gr.Checkbox(label="Generate Summary", value=False)

    transcribe_btn = gr.Button("Transcribe", variant="primary")
    status_text = gr.Textbox(label="Status", interactive=False)

    transcription_output = gr.Textbox(label="Full Transcription", lines=10)
    downloadable_files_output = gr.Files(label="Download Transcripts")
    summary_output = gr.Textbox(label="Summary", lines=5)

    # Wire the button to the transcription function. Note that audio_input
    # also appears in outputs: it is updated to the extracted audio path.
    transcribe_btn.click(
        fn=transcribe_and_export,
        inputs=[audio_input, model_selector, vtt_checkbox, docx_ts_checkbox, docx_no_ts_checkbox, summary_checkbox],
        outputs=[transcription_output, downloadable_files_output, audio_input, summary_output, status_text]
    )


if __name__ == "__main__":
    demo.launch()