Spaces:
Build error
Build error
| import gradio as gr | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
| from transformers import BartForConditionalGeneration, BartTokenizer | |
| from concurrent.futures import ThreadPoolExecutor | |
# Checkpoint shared by the summarization model and its tokenizer.
_CHECKPOINT = 'facebook/bart-large-cnn'

# Both objects are created once at import time and reused by every request.
tokenizer = BartTokenizer.from_pretrained(_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_CHECKPOINT)
def chunk_text(text, chunk_size=1024):
    """Split ``text`` into consecutive lists of token ids.

    The text is tokenized with the module-level ``tokenizer`` (no
    truncation) and the ids are sliced into runs of at most ``chunk_size``
    ids. Only the final chunk may be shorter.

    Args:
        text: The string to tokenize.
        chunk_size: Maximum number of token ids per chunk; must be >= 1.

    Returns:
        A list of token-id lists in document order. The list is empty when
        ``text`` yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    # Keep BOS/EOS markers out of the chunks: they are not content, and they
    # are added again when a decoded chunk is re-tokenized for the model, so
    # counting them here would push full-size chunks over the requested size
    # and turn empty input into a chunk made only of special tokens.
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    return [
        tokens[start:start + chunk_size]
        for start in range(0, len(tokens), chunk_size)
    ]
def summarize_chunk(chunk, summary_max_length=150):
    """Produce a summary string for one chunk of token ids.

    The chunk is decoded back to plain text, re-tokenized (truncated to
    1024 tokens) and fed to the module-level ``model`` using 4-beam search.

    Args:
        chunk: Token ids to summarize.
        summary_max_length: Value passed as ``max_length`` to
            ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    plain_text = tokenizer.decode(chunk, skip_special_tokens=True)
    encoded = tokenizer(
        [plain_text],
        max_length=1024,
        return_tensors='pt',
        truncation=True,
    )
    generated = model.generate(
        encoded['input_ids'],
        num_beams=4,
        max_length=summary_max_length,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def summarize_chunks_parallel(chunks, summary_max_length=150):
    """Summarize every chunk on a thread pool and join the results.

    Args:
        chunks: Iterable of token-id chunks, each passed to
            ``summarize_chunk``.
        summary_max_length: Forwarded unchanged to ``summarize_chunk``.

    Returns:
        The per-chunk summaries, in input order, joined by single spaces.
    """
    def _summarize(piece):
        # Bind the shared length limit so the pool can map over chunks only.
        return summarize_chunk(piece, summary_max_length)

    with ThreadPoolExecutor() as pool:
        partial_summaries = list(pool.map(_summarize, chunks))
    return ' '.join(partial_summaries)
def summarize_text(text, title=None, author=None, length_ratio=0.25):
    """Summarize ``text`` chunk by chunk, optionally prefixed with an intro.

    The text is split into token chunks whose size scales with the input
    length (clamped to 512..1024 tokens), every chunk is summarized, and
    the partial summaries are joined. When ``title`` and/or ``author`` is
    given, an introductory sentence naming them is prepended.

    Args:
        text: The text to summarize.
        title: Optional title mentioned in the introductory sentence.
        author: Optional author mentioned in the introductory sentence.
        length_ratio: Fraction of the 1024-token window allowed for each
            chunk's summary.

    Returns:
        The combined summary string.
    """
    # Token window used throughout this file for model input.
    max_tokens = 1024

    # Measure the full, untruncated length. Truncating here caps the count
    # at the tokenizer's model limit, which pins the chunk size to its
    # 512-token floor no matter how long the input is.
    input_length = len(tokenizer.encode(text, truncation=False))
    chunk_size = min(max_tokens, max(512, input_length // 8))

    chunks = chunk_text(text, chunk_size=chunk_size)

    # This limit is applied to each chunk separately, so it must not grow
    # with the number of chunks; scaling it by len(chunks) asked the model
    # for outputs far beyond the 1024-token window on long inputs.
    summary_max_length = max(1, min(int(length_ratio * max_tokens), max_tokens))

    summary = summarize_chunks_parallel(
        chunks, summary_max_length=summary_max_length
    )

    # Prepend an introductory sentence when a title or author is available.
    if title or author:
        intro = f"The text titled '{title}'" if title else "The text"
        if author:
            intro += f" by {author}"
        intro += " discusses the following main points: "
        summary = intro + summary
    return summary
def extract_text_from_url(url):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        The paragraph texts joined by single spaces, or an empty string
        when the page cannot be fetched or parsed. Failures are logged
        rather than returned, so an error message is never mistaken for
        page content by the caller.
    """
    import logging  # local import keeps this block self-contained

    try:
        # Bound the wait so an unresponsive server cannot block forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return ' '.join(para.get_text() for para in soup.find_all('p'))
    except Exception:
        # Best-effort boundary: report the problem and signal "no text".
        logging.getLogger(__name__).exception(
            "Could not extract text from URL %r", url
        )
        return ""
def extract_text_from_pdf(file):
    """Return the text extracted from every page of a PDF.

    Args:
        file: Whatever ``PdfReader`` accepts for the document to read.

    Returns:
        The non-empty page texts joined by newlines, or an empty string
        when the document cannot be read or contains no extractable text.
        Failures are logged rather than returned, so an error message is
        never mistaken for document content by the caller.
    """
    import logging  # local import keeps this block self-contained

    try:
        reader = PdfReader(file)
        # Guard against pages that yield nothing (a falsy result).
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception:
        # Best-effort boundary: report the problem and signal "no text".
        logging.getLogger(__name__).exception(
            "Could not extract text from PDF %r", file
        )
        return ""
    # Separate pages so words at page boundaries are not glued together;
    # skip empty pages so a text-free document still yields "".
    return "\n".join(page_text for page_text in page_texts if page_text)
def extract_text_from_docx(file):
    """Return the paragraph text of a DOCX document.

    Args:
        file: Whatever ``Document`` accepts for the document to read.

    Returns:
        Every paragraph's text followed by a newline, concatenated in
        document order, or an empty string when the document cannot be
        read. Failures are logged rather than returned, so an error
        message is never mistaken for document content by the caller.
    """
    import logging  # local import keeps this block self-contained

    try:
        doc = Document(file)
        return "".join(f"{para.text}\n" for para in doc.paragraphs)
    except Exception:
        # Best-effort boundary: report the problem and signal "no text".
        logging.getLogger(__name__).exception(
            "Could not extract text from DOCX %r", file
        )
        return ""
def process_input(text=None, url=None, file=None, length_ratio=0.25):
    """Summarize the first usable input among text, URL and uploaded file.

    Sources are tried in that order. Values that are empty or contain only
    whitespace are treated as absent, both for the inputs themselves and
    for text extracted from a URL or file.

    Args:
        text: Raw text to summarize.
        url: Address of a web page whose paragraphs are summarized.
        file: Uploaded PDF or DOCX, either a path string or an object
            exposing its path as ``.name``.
        length_ratio: Forwarded to ``summarize_text``.

    Returns:
        The summary, or a user-facing message explaining why none could be
        produced.
    """
    def _has_content(value):
        # Blank or whitespace-only values carry nothing to summarize.
        return bool(value) and bool(str(value).strip())

    if _has_content(text):
        return summarize_text(text, length_ratio=length_ratio)

    if _has_content(url):
        extracted = extract_text_from_url(url.strip())
        if _has_content(extracted):
            return summarize_text(extracted, length_ratio=length_ratio)
        return "No text extracted from the URL."

    if file:
        # Accept both a plain path and an upload object with a .name path,
        # and compare the extension case-insensitively (".PDF" is a PDF).
        file_name = str(getattr(file, "name", file)).lower()
        if file_name.endswith('.pdf'):
            extracted = extract_text_from_pdf(file)
        elif file_name.endswith('.docx'):
            extracted = extract_text_from_docx(file)
        else:
            return "Unsupported file type. Please upload a PDF or DOCX file."
        if _has_content(extracted):
            return summarize_text(extracted, length_ratio=length_ratio)
        return "No text extracted from the file."

    return "Please provide text, a URL, or upload a file."
# Gradio UI: three alternative sources (text, URL, file) plus a length slider.
_input_components = [
    # Roomy box for pasted text.
    gr.Textbox(label="Input Text", placeholder="Enter text here...", lines=10),
    # Short box for a single address.
    gr.Textbox(label="URL", placeholder="Enter URL here...", lines=2),
    gr.File(label="Upload a file (PDF or DOCX)"),
    gr.Slider(
        label="Summary Length Ratio (as a fraction of the original)",
        minimum=0.1,
        maximum=1.0,
        step=0.05,
        value=0.25,
    ),
]
# Tall box so long summaries stay readable.
_summary_box = gr.Textbox(label="Summary", lines=20)

interface = gr.Interface(
    fn=process_input,
    inputs=_input_components,
    outputs=_summary_box,
    title="Text Summarization Tool",
    description="Enter text, paste a URL, or upload a PDF/DOCX file to generate a summary. Adjust the summary length with the slider.",
)
interface.launch()