File size: 3,014 Bytes
6d5e888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import gradio as gr
import os
import tempfile
import speech_recognition as sr
from moviepy.editor import VideoFileClip
import cv2
from PIL import Image
import pytesseract
import nltk
from transformers import pipeline

# Download NLP models
# NOTE: both lines below hit the network on first run (NLTK data server /
# Hugging Face hub) and run at import time, so startup can be slow offline.
nltk.download("punkt")  # Punkt sentence-tokenizer data
summarizer = pipeline("summarization")  # default HF summarization model, used by summarize_text()

# Audio Transcription
def transcribe_audio(audio_path):
    """Transcribe an audio file to text using Google's free web speech API.

    May raise sr.UnknownValueError / sr.RequestError; the caller handles
    transcription failures.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        captured = recognizer.record(source)
    transcript = recognizer.recognize_google(captured)
    return transcript

# Extract audio from video
def extract_audio(video_path):
    """Extract the audio track of *video_path* into a WAV file.

    Returns the path of the written WAV file ("temp_audio.wav").
    The clip is explicitly closed so its ffmpeg reader processes and file
    handles are released (the original leaked them on every call).
    """
    audio_path = "temp_audio.wav"
    video = VideoFileClip(video_path)
    try:
        video.audio.write_audiofile(audio_path)
    finally:
        video.close()  # releases both the video and audio ffmpeg readers
    return audio_path

# Extract key frames from video
def extract_frames(video_path, interval=90, max_frames=3):  # 90 frames ~ 3s at ~30fps
    """Sample one frame every *interval* frames and save up to *max_frames* JPEGs.

    Returns the list of written image paths (``frame_<N>.jpg``).

    Fixes over the original: the capture handle is now released, and reading
    stops as soon as enough frames are collected — the old version decoded
    the whole video and wrote every sampled frame to disk only to discard
    all but the first three.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    try:
        success, image = vidcap.read()
        while success and len(frames) < max_frames:
            if count % interval == 0:
                filename = f"frame_{count}.jpg"
                cv2.imwrite(filename, image)
                frames.append(filename)
            success, image = vidcap.read()
            count += 1
    finally:
        vidcap.release()  # free the decoder even if cv2.imwrite raises
    return frames

# OCR on images
def ocr_text_from_frames(frame_paths):
    """Run Tesseract OCR on each frame image and join the results with newlines.

    Opens each image in a context manager so its file handle is closed
    promptly (the original left every handle to the garbage collector).
    """
    texts = []
    for frame in frame_paths:
        with Image.open(frame) as img:
            texts.append(pytesseract.image_to_string(img))
    return "\n".join(texts)

# Summarize long text
def summarize_text(text):
    """Summarize *text* chunk by chunk.

    The input is split into ~1000-character slices (the summarization model
    has a limited input window); each slice is summarized independently and
    the partial summaries are joined with newlines.
    """
    partial_summaries = []
    for start in range(0, len(text), 1000):
        chunk = text[start:start + 1000]
        result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
        partial_summaries.append(result[0]["summary_text"])
    return "\n".join(partial_summaries)

# Core function
def process_lecture(file):
    """Process an uploaded lecture recording into notes.

    Parameters
    ----------
    file : uploaded file object (has ``.name`` and ``.read()``)

    Returns
    -------
    (transcript, slide_text, summary) : tuple of str
        Video uploads additionally get OCR text from sampled frames;
        audio uploads get an empty ``slide_text``.

    Fixes over the original: the extension check is now case-insensitive
    (``.MP4`` uploads were silently misrouted as audio), and all temporary
    files (upload copy, extracted audio, frame JPEGs) are removed when
    processing finishes instead of accumulating on disk.
    """
    suffix = os.path.splitext(file.name)[-1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(file.read())
        input_path = tmp.name

    frames = []
    audio_path = input_path
    try:
        if suffix in (".mp4", ".mkv", ".avi"):
            audio_path = extract_audio(input_path)
            frames = extract_frames(input_path)
            slide_text = ocr_text_from_frames(frames)
        else:
            slide_text = ""

        try:
            transcript = transcribe_audio(audio_path)
        except Exception as e:
            # Best effort: show the failure in the transcript box rather than
            # crashing the whole request.
            transcript = f"[Error during transcription: {e}]"

        full_text = transcript + "\n" + slide_text
        summary = summarize_text(full_text) if full_text.strip() else "No content to summarize."
        return transcript, slide_text, summary
    finally:
        # Remove everything this request wrote to disk. The set avoids a
        # double-remove when audio_path is still the upload copy itself.
        for path in {input_path, audio_path, *frames}:
            try:
                os.remove(path)
            except OSError:
                pass  # already gone or never created — nothing to clean

# Launch Gradio Interface
# One file input mapped to the three string outputs of process_lecture,
# in order: transcript, OCR text, summary.
iface = gr.Interface(
    fn=process_lecture,
    inputs=gr.File(label="Upload Lecture Audio or Video"),
    outputs=[
        gr.Textbox(label="🎀 Transcript"),
        gr.Textbox(label="πŸ–Ό Slide OCR Text"),
        gr.Textbox(label="πŸ“ Summary Notes")
    ],
    title="Smart Lecture Notes Generator",
    description="Upload a lecture recording (audio or video). It will transcribe speech, extract slide text via OCR, and generate summarized notes."
)

# Blocks until the server is stopped; runs at import time (no __main__ guard).
iface.launch()