|
|
import gradio as gr |
|
|
from transformers import pipeline, AutoTokenizer |
|
|
import logging |
|
|
import traceback |
|
|
import sys |
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import concurrent.futures |
|
|
import time |
|
|
from gtts import gTTS |
|
|
import io |
|
|
import base64 |
|
|
|
|
|
# Configure root logger: INFO level, timestamped "time - LEVEL - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
|
# Load the t5-small summarization pipeline once at import time (downloads the
# model on first run; kept module-level so Gradio handlers reuse it).
summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")


# Standalone tokenizer for the same checkpoint — presumably used elsewhere to
# count/trim tokens before summarization (verify against later code).
tokenizer = AutoTokenizer.from_pretrained("t5-small")
|
|
|
|
|
def fetch_content_from_url(url, max_chars=10000):
    """Fetch a web page and return the text of its <p> tags, joined by spaces.

    Args:
        url: The page URL to fetch.
        max_chars: Maximum number of characters to return (default 10000,
            matching the original hard-coded truncation).

    Returns:
        The concatenated paragraph text truncated to ``max_chars``, or an
        ``"Error fetching content: ..."`` string on any failure (callers
        rely on this error-as-string contract, so it is preserved).
    """
    try:
        # Send a browser-like User-Agent: many sites return 403 for the
        # default python-requests UA.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; SummarizerBot/1.0)"}
        response = requests.get(url, timeout=10, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        # strip=True trims stray whitespace inside each paragraph.
        content = ' '.join(p.get_text(strip=True) for p in paragraphs)
        return content[:max_chars]
    except Exception as e:
        # Log the full traceback, but keep returning the error string so
        # existing callers (which display whatever comes back) still work.
        logging.exception("Failed to fetch content from %s", url)
        return f"Error fetching content: {str(e)}"
|
|
|
|
|
def chunk_text(text, max_chunk_size=200): |
|
|
words = text.split()[:1000] |
|
|
chunks = [] |
|
|
current_chunk = [] |
|
|
current_size |
|
|
|