karthikmn committed
Commit 8a0fbba · verified · 1 Parent(s): 09d9f61

Update app.py

Files changed (1)
  1. app.py +32 -93
app.py CHANGED
@@ -1,98 +1,37 @@
  import gradio as gr
- import os
- import tempfile
- import speech_recognition as sr
- from moviepy.editor import VideoFileClip
- import cv2
- from PIL import Image
- import pytesseract
- import nltk
- from transformers import pipeline
-
- # Download NLP models
- nltk.download("punkt")
- summarizer = pipeline("summarization")
-
- # Audio Transcription
- def transcribe_audio(audio_path):
-     recognizer = sr.Recognizer()
-     with sr.AudioFile(audio_path) as source:
-         audio = recognizer.record(source)
-     return recognizer.recognize_google(audio)
-
- # Extract audio from video
- def extract_audio(video_path):
-     video = VideoFileClip(video_path)
-     audio_path = "temp_audio.wav"
-     video.audio.write_audiofile(audio_path)
-     return audio_path
-
- # Extract key frames from video
- def extract_frames(video_path, interval=90):  # 3 seconds if ~30fps
-     vidcap = cv2.VideoCapture(video_path)
-     success, image = vidcap.read()
-     count = 0
-     frames = []
-     while success:
-         if count % interval == 0:
-             filename = f"frame_{count}.jpg"
-             cv2.imwrite(filename, image)
-             frames.append(filename)
-         success, image = vidcap.read()
-         count += 1
-     return frames[:3]  # return top 3
-
- # OCR on images
- def ocr_text_from_frames(frame_paths):
-     texts = []
-     for frame in frame_paths:
-         img = Image.open(frame)
-         text = pytesseract.image_to_string(img)
-         texts.append(text)
-     return "\n".join(texts)
-
- # Summarize long text
- def summarize_text(text):
-     chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
-     summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text'] for chunk in chunks]
-     return "\n".join(summaries)
-
- # Core function
- def process_lecture(file):
-     suffix = os.path.splitext(file.name)[-1]
-     with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-         tmp.write(file.read())
-         input_path = tmp.name
-
-     if suffix in [".mp4", ".mkv", ".avi"]:
-         audio_path = extract_audio(input_path)
-         frames = extract_frames(input_path)
-         slide_text = ocr_text_from_frames(frames)
-     else:
-         audio_path = input_path
-         slide_text = ""
-
-     try:
-         transcript = transcribe_audio(audio_path)
-     except Exception as e:
-         transcript = f"[Error during transcription: {e}]"
-
-     full_text = transcript + "\n" + slide_text
-     summary = summarize_text(full_text) if full_text.strip() else "No content to summarize."
-
-     return transcript, slide_text, summary
-
- # Launch Gradio Interface
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import torch
+
+ # Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
+
+ # Function to convert speech to text
+ def speech_to_text(audio_file):
+     # Load the audio file
+     audio_input, _ = torchaudio.load(audio_file.name)
+
+     # Preprocess the audio input (e.g., resample, normalize, etc.)
+     input_values = processor(audio_input, return_tensors="pt").input_values
+
+     # Perform speech-to-text (CTC Decoding)
+     with torch.no_grad():
+         logits = model(input_values).logits
+         predicted_ids = torch.argmax(logits, dim=-1)
+
+     # Decode the predicted ids to text
+     transcription = processor.decode(predicted_ids[0])
+
+     return transcription
+
+ # Set up the Gradio interface
  iface = gr.Interface(
-     fn=process_lecture,
-     inputs=gr.File(label="Upload Lecture Audio or Video"),
-     outputs=[
-         gr.Textbox(label="🎤 Transcript"),
-         gr.Textbox(label="🖼 Slide OCR Text"),
-         gr.Textbox(label="📝 Summary Notes")
-     ],
-     title="Smart Lecture Notes Generator",
-     description="Upload a lecture recording (audio or video). It will transcribe speech, extract slide text via OCR, and generate summarized notes."
+     fn=speech_to_text,  # Function to be executed
+     inputs=gr.Audio(source="upload", type="file"),  # Allow audio file upload
+     outputs=gr.Textbox(),  # Display transcription in a text box
+     title="Speech-to-Text Analyzer for Lecture Notes",
+     description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech."
  )

+ # Launch the interface
  iface.launch()
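
As committed, the new app.py launches but fails on the first transcription: speech_to_text calls torchaudio.load without importing torchaudio, the waveform is fed to the processor at its native sample rate and (channels, samples) shape even though facebook/wav2vec2-large-960h expects 16 kHz mono input, and gr.Audio(source="upload", type="file") uses arguments that recent Gradio releases deprecate or reject. A minimal corrected sketch of the same greedy-CTC pipeline, assuming a Gradio version that accepts type="filepath" (the labels below are illustrative, not part of the commit):

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

def speech_to_text(audio_path):
    # torchaudio returns a (channels, samples) tensor plus the file's native rate
    waveform, sample_rate = torchaudio.load(audio_path)

    # Downmix to mono and resample to the 16 kHz rate the model was trained on
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

    # The processor expects a 1-D waveform and an explicit sampling rate
    input_values = processor(waveform, sampling_rate=16000, return_tensors="pt").input_values

    # Greedy CTC decoding: argmax per frame, then collapse repeats and blanks
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])

iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(type="filepath", label="Upload Lecture Audio"),
    outputs=gr.Textbox(label="Transcript"),
    title="Speech-to-Text Analyzer for Lecture Notes",
    description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech."
)

iface.launch()

With type="filepath" the handler receives a plain path string, so the audio_file.name indirection from the commit is no longer needed; recent gr.Audio accepts uploads by default, which is why source="upload" is dropped.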
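
A further caveat for lecture-length recordings: wav2vec2 is a transformer, so compute and memory grow steeply with input length, and a one-hour file in a single forward pass will likely exhaust a CPU Space. A common workaround is windowed decoding. The helper below is hypothetical (not in the commit) and reuses the processor and model from the sketch above; naive splitting can cut a word at each window boundary, so overlapping windows would be the next refinement.

def transcribe_long(waveform, sample_rate=16000, chunk_seconds=30):
    # Hypothetical helper: greedy-decode a long 1-D waveform in fixed windows
    step = chunk_seconds * sample_rate
    pieces = []
    for start in range(0, waveform.shape[0], step):
        chunk = waveform[start:start + step]
        input_values = processor(chunk, sampling_rate=sample_rate, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        pieces.append(processor.decode(predicted_ids[0]))
    return " ".join(pieces)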
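
Finally, a Space installs Python dependencies from requirements.txt, which this commit does not touch. The rewrite drops speech_recognition, moviepy, opencv, pytesseract, and nltk but now needs torch and torchaudio, so that file presumably has to change as well. A plausible minimal listing (an assumption; the actual file is not shown in this commit):

gradio
torch
torchaudio
transformers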