File size: 1,114 Bytes
484f683
09aef7c
f9db6c6
745c3e0
f147862
da0397f
 
c2b73fe
 
feb7a08
8d79195
 
484f683
745c3e0
0eb8dff
c2b73fe
 
 
09aef7c
 
f147862
09aef7c
 
 
 
 
c2b73fe
f147862
09aef7c
f147862
c2b73fe
 
09aef7c
 
feb7a08
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import gradio as gr
from transformers import pipeline, AutoTokenizer
import logging
import traceback
import sys
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
from gtts import gTTS
import io
import base64

# Configure root logging with timestamped, leveled output for the whole app.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the summarizer with T5-small model
# NOTE: both calls may download model weights on first run (network + disk I/O).
# The pipeline already builds its own tokenizer internally; the separate
# AutoTokenizer instance below is presumably used elsewhere for token counting.
summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def fetch_content_from_url(url):
    """Download *url* and return the concatenated text of its <p> tags.

    The result is capped at 10,000 characters. On any failure (network
    error, bad status, parse problem) a human-readable error string is
    returned instead of raising, so callers always get a string back.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        # Extract visible paragraph text only; other markup is ignored.
        parsed = BeautifulSoup(resp.content, 'html.parser')
        text = ' '.join(p.get_text() for p in parsed.find_all('p'))
        # Limit to first 10000 characters
        return text[:10000]
    except Exception as e:
        return f"Error fetching content: {str(e)}"

def chunk_text(text, max_chunk_size=200):
    words = text.split()[:1000]  # Limit to first 1000 words
    chunks = []
    current_chunk = []
    current_size