Spaces:

karthikmn
/

audio.py

Build error

App Files Files Community

karthikmn committed on Jun 7, 2025

Commit

61324e8

verified ·

1 Parent(s): da1b807

Create app.py

Browse files

Files changed (1) hide show

app.py +116 -0

app.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import gradio as gr
+import os
+import tempfile
+import speech_recognition as sr
+import nltk
+from nltk.stem import PorterStemmer
+from nltk.tokenize import word_tokenize
+from moviepy.editor import VideoFileClip
+from pytesseract import image_to_string
+from PIL import Image
+import cv2
+from transformers import pipeline
+import concurrent.futures
# Downloads
# word_tokenize and pos_tag need NLTK data packages. NLTK 3.9+ looks for
# "punkt_tab" and "averaged_perceptron_tagger_eng", while older releases use
# the original ids, so both generations are requested. An id unknown to the
# installed NLTK is expected to be reported by the downloader, not raised.
for _resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_resource, quiet=True)
# Use faster summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+# Functions
def extract_audio(video_path):
    """Write the audio track of a video to a fresh temporary WAV file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.wav`` file. The caller owns the file and
        is responsible for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError("The uploaded video has no audio track.")
        # A unique name per call keeps concurrent requests from overwriting
        # each other's audio, which a fixed file name in the CWD would allow.
        handle, audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)
        try:
            video.audio.write_audiofile(audio_path, verbose=False, logger=None)
        except Exception:
            # Do not leave a half-written file behind when the export fails.
            if os.path.exists(audio_path):
                os.remove(audio_path)
            raise
        return audio_path
    finally:
        # Release the reader held by the clip.
        video.close()
def transcribe_audio(audio_path, duration=30):
    """Transcribe the beginning of an audio file with Google Web Speech.

    Args:
        audio_path: Path of an audio file readable by ``sr.AudioFile``.
        duration: Maximum number of seconds to transcribe. Defaults to 30,
            the limit this function has always applied. ``None`` is passed
            straight to ``Recognizer.record``.

    Returns:
        The recognized text, or an empty string when the recognizer could not
        make out any speech.

    Raises:
        sr.RequestError: Propagated unchanged when the recognition service
            cannot be reached or rejects the request.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source, duration=duration)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        # Silence or unintelligible audio is a normal outcome, not a failure.
        return ""
def extract_keywords(text):
    """Return the stemmed nouns and verbs found in *text*.

    Args:
        text: Text to tokenize and part-of-speech tag.

    Returns:
        A sorted list of unique strings shaped like ``"stem (TAG)"``, one for
        every token whose tag starts with ``NN`` or ``VB``. Sorting keeps the
        output stable between runs.
    """
    stemmer = PorterStemmer()
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    keywords = {
        f"{stemmer.stem(word.lower())} ({tag})"
        for word, tag in tagged_tokens
        if tag.startswith(("NN", "VB"))
    }
    return sorted(keywords)
def _split_on_words(words, limit):
    """Group *words* into lists whose space-joined length stays within *limit*."""
    chunks = []
    current = []
    size = 0
    for word in words:
        added = len(word) + (1 if current else 0)
        if current and size + added > limit:
            chunks.append(current)
            current = []
            size = 0
            added = len(word)
        # A single word longer than the limit still gets a chunk of its own.
        current.append(word)
        size += added
    if current:
        chunks.append(current)
    return chunks


def summarize_text(text, ratio="short", chunk_chars=1000):
    """Summarize *text* chunk by chunk with the module-level summarizer.

    Args:
        text: Transcript to summarize.
        ratio: ``"short"``, ``"medium"``, or anything else for the longest
            setting. Selects the (max_length, min_length) pair handed to the
            summarizer: (100, 30), (150, 50) or (250, 80).
        chunk_chars: Approximate maximum number of characters per chunk.

    Returns:
        The chunk summaries joined by single spaces, or the fixed message
        ``"Transcript is too short to summarize."`` when the text has fewer
        words than the selected minimum length.
    """
    if ratio == "short":
        max_len, min_len = 100, 30
    elif ratio == "medium":
        max_len, min_len = 150, 50
    else:
        max_len, min_len = 250, 80

    words = text.split()
    if len(words) < min_len:
        return "Transcript is too short to summarize."

    # Split on word boundaries so no word is cut in half between two chunks.
    chunks = _split_on_words(words, chunk_chars)
    # Fold a very short tail into the previous chunk; summarizing it alone
    # would ask for at least min_len tokens from only a handful of words.
    if len(chunks) > 1 and len(chunks[-1]) < min_len:
        tail = chunks.pop()
        chunks[-1].extend(tail)

    pieces = [
        summarizer(
            " ".join(chunk),
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
        )[0]["summary_text"]
        for chunk in chunks
    ]
    return " ".join(pieces).strip()
def extract_slide_text(video_path, frame_interval=30, max_samples=20):
    """OCR a sample of frames from a video and return the distinct texts.

    Args:
        video_path: Path of the video file to read.
        frame_interval: OCR every ``frame_interval``-th frame, starting with
            the first one. Values below 1 are treated as 1.
        max_samples: Stop after this many frames have been run through OCR.

    Returns:
        The distinct non-empty OCR results in order of first appearance,
        separated by blank lines. Empty when nothing was recognized.
    """
    frame_interval = max(1, frame_interval)
    cap = cv2.VideoCapture(video_path)
    texts = []
    seen = set()
    frame_index = 0
    sampled = 0
    try:
        # The limit applies to OCR'd frames, not to frames read. Capping the
        # raw frame counter below the sampling interval would mean only the
        # very first frame is ever inspected.
        while cap.isOpened() and sampled < max_samples:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % frame_interval == 0:
                sampled += 1
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                text = image_to_string(image).strip()
                if text and text not in seen:
                    seen.add(text)
                    texts.append(text)
            frame_index += 1
    finally:
        # Release the capture even when decoding or OCR raises.
        cap.release()
    return "\n\n".join(texts)
+# Gradio UI
def process_file(uploaded_file):
    """Transcribe, summarize and OCR an uploaded audio or video file.

    Args:
        uploaded_file: Either a filesystem path (``str`` / ``os.PathLike``)
            or a file-like object with ``read()`` and, optionally, ``name``.
            ``None`` (nothing uploaded) yields an empty result.

    Returns:
        A dict with the keys ``"transcript"`` (str), ``"keywords"`` (list of
        str) and ``"summary"`` (str), plus ``"slide_text"`` (str) when OCR
        found text in a video.
    """
    if uploaded_file is None:
        return {"transcript": "", "keywords": [], "summary": ""}

    video_extensions = (".mp4", ".mov", ".avi", ".mkv")
    # Files created here; removed in the finally block whatever happens.
    temp_paths = []
    try:
        if isinstance(uploaded_file, (str, os.PathLike)):
            # Already on disk: use it in place and leave it to its owner.
            file_path = os.fspath(uploaded_file)
        else:
            # Only the extension is a valid suffix; the full name may contain
            # directory separators, which tempfile cannot turn into a file.
            original_name = getattr(uploaded_file, "name", "") or ""
            suffix = os.path.splitext(original_name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_paths.append(temp_file.name)
                temp_file.write(uploaded_file.read())
            file_path = temp_file.name

        # One case-insensitive check drives both audio extraction and OCR.
        is_video = file_path.lower().endswith(video_extensions)
        audio_path = file_path
        if is_video:
            audio_path = extract_audio(file_path)
            temp_paths.append(audio_path)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Running OCR and transcription in parallel
            ocr_future = executor.submit(extract_slide_text, file_path) if is_video else None
            trans_future = executor.submit(transcribe_audio, audio_path)
            transcript = trans_future.result()
            slide_text = ocr_future.result() if ocr_future else ""

        results = {}
        if slide_text:
            results["slide_text"] = slide_text
        results["transcript"] = transcript
        results["keywords"] = extract_keywords(transcript)
        results["summary"] = summarize_text(transcript, ratio="short")
        return results
    finally:
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)
# Gradio Interface
def _to_output_fields(uploaded_file):
    """Map the dict returned by process_file onto the four output textboxes.

    The order matches ``outputs`` below: transcript, keywords, summary and
    slide text. Missing keys become empty strings, and the keyword list is
    rendered as a comma-separated line.
    """
    results = process_file(uploaded_file) or {}
    return (
        results.get("transcript", ""),
        ", ".join(results.get("keywords", [])),
        results.get("summary", ""),
        results.get("slide_text", ""),
    )


# NOTE(review): "filepath" is the File type accepted by current Gradio
# releases, which reject "file"; confirm against the version the Space pins.
inputs = gr.File(label="Upload Audio/Video File (Any Format)", type="filepath")
outputs = [
    gr.Textbox(label="Full Transcription", lines=10),
    gr.Textbox(label="Keywords", lines=2),
    gr.Textbox(label="Lecture Summary", lines=10),
    gr.Textbox(label="Slide/Whiteboard Text", lines=10),
]
gr.Interface(fn=_to_output_fields, inputs=inputs, outputs=outputs, live=True).launch()