Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import nltk | |
| import numpy as np | |
| import networkx as nx | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| from nltk.corpus import stopwords | |
| import string | |
| from transformers import BartForConditionalGeneration, BartTokenizer | |
| import requests | |
| from PyPDF2 import PdfReader | |
| from bs4 import BeautifulSoup | |
| import fitz # PyMuPDF | |
| import docx | |
| from PIL import Image | |
| import pytesseract | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
# Download only the NLTK resources this module uses rather than the whole
# 'all' collection, which is very large and slows every cold start:
#   - 'punkt' / 'punkt_tab': models behind sent_tokenize / word_tokenize
#     (recent NLTK releases look for 'punkt_tab', older ones for 'punkt')
#   - 'stopwords': the English stop-word list read in preprocess_text
# An id unknown to the installed NLTK version is reported as a failed
# download (return value False) and does not raise.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_resource, quiet=True)

# Load pre-trained BART model and tokenizer (shared by bart_summary)
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
def preprocess_text(text):
    """Split *text* into sentences and build a cleaned copy of each one.

    Each sentence is lower-cased and word-tokenized; English stop words and
    punctuation tokens are dropped and the remaining tokens are re-joined
    with single spaces.

    Args:
        text: The raw text to process.

    Returns:
        A ``(sentences, preprocessed_sentences)`` tuple of two lists with the
        same length: the untouched sentences and their cleaned counterparts.
    """
    raw_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words('english'))

    def _clean(sentence):
        kept_tokens = []
        for token in word_tokenize(sentence.lower()):
            # NOTE: ``in string.punctuation`` is a substring test against the
            # punctuation string, so multi-character runs such as "<=" are
            # dropped as well as single punctuation characters.
            if token in english_stops or token in string.punctuation:
                continue
            kept_tokens.append(token)
        return ' '.join(kept_tokens)

    return raw_sentences, [_clean(sentence) for sentence in raw_sentences]
def build_similarity_matrix(sentences):
    """Return the pairwise cosine-similarity matrix of *sentences*.

    The sentences are vectorized with a default ``TfidfVectorizer`` and the
    resulting TF-IDF matrix is compared against itself.

    Args:
        sentences: A list of sentence strings.

    Returns:
        The matrix produced by ``cosine_similarity`` for the TF-IDF vectors,
        with one row and one column per input sentence.
    """
    tfidf_vectors = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(tfidf_vectors)
def textrank_summary(text, num_sentences=5):
    """Summarize *text* by ranking its sentences with PageRank.

    Sentences are compared through the cosine similarity of their TF-IDF
    vectors; the similarity matrix is turned into a graph and scored with
    ``networkx.pagerank``. The highest scoring sentences are returned,
    best first.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined with single spaces. An empty string is
        returned when *text* contains no sentences; when no sentence contains
        an indexable term the leading sentences are returned unranked.
    """
    original_sentences, preprocessed_sentences = preprocess_text(text)
    if not original_sentences:
        # Nothing to rank; vectorizing an empty list would raise.
        return ''

    try:
        similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError on an empty vocabulary (for
        # example when every sentence consists of stop words only). There is
        # nothing to rank by, so fall back to the document order.
        return ' '.join(original_sentences[:num_sentences])

    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    # Sort by score, highest first; ties fall back to comparing the sentences.
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_sentences)),
        reverse=True,
    )
    return ' '.join(sentence for _score, sentence in ranked_sentences[:num_sentences])
def tfidf_summary(text, num_sentences=5):
    """Summarize *text* by picking the sentences with the highest TF-IDF mass.

    Every sentence is scored with the sum of the TF-IDF weights of its terms
    and the highest scoring sentences are returned, best first.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined with single spaces. An empty string is
        returned when *text* contains no sentences; when no sentence contains
        an indexable term the leading sentences are returned unranked.
    """
    original_sentences, preprocessed_sentences = preprocess_text(text)
    if not original_sentences:
        # Nothing to score; vectorizing an empty list would raise.
        return ''

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(preprocessed_sentences)
    except ValueError:
        # Raised by TfidfVectorizer on an empty vocabulary (for example when
        # every sentence consists of stop words only). There is nothing to
        # score by, so fall back to the document order.
        return ' '.join(original_sentences[:num_sentences])

    # Row sums of the TF-IDF matrix give one score per sentence.
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    best_first = np.argsort(sentence_scores)[::-1]
    return ' '.join(original_sentences[i] for i in best_first[:num_sentences])
def bart_summary(text):
    """Generate an abstractive summary of *text* with the module-level BART model.

    The input is tokenized with truncation at 1024 tokens and passed to
    ``model.generate``; the first generated sequence is decoded without
    special tokens.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        padding=True,
    )
    generation_options = {
        "max_length": 1000,
        "min_length": 50,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated_ids = model.generate(encoded["input_ids"], **generation_options)
    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def extract_text_from_url(url):
    """Download *url* and return the text of all ``<p>`` elements on the page.

    Args:
        url: Address of the web page to fetch.

    Returns:
        The paragraph texts joined with single spaces, or a message starting
        with ``"Error fetching link:"`` when the page cannot be fetched or
        parsed. The function never raises; callers detect failure through
        that message.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=15)
        # Report HTTP 4xx/5xx responses instead of summarizing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Deliberately broad: every failure is converted into a message.
        return f"Error fetching link: {e}"
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* (via PyMuPDF).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, or a message starting with
        ``"Error reading PDF:"`` when the file cannot be opened or read.
    """
    try:
        # The context manager closes the document even if a page fails,
        # so the file handle is not leaked.
        with fitz.open(pdf_path) as document:
            return ''.join(page.get_text() for page in document)
    except Exception as e:
        # Deliberately broad: every failure is converted into a message.
        return f"Error reading PDF: {e}"
def extract_text_from_docx(docx_path):
    """Return the paragraph texts of the DOCX file at *docx_path*.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        The paragraph texts joined with single spaces, or a message starting
        with ``"Error reading DOCX:"`` when the file cannot be read.
    """
    try:
        loaded = docx.Document(docx_path)
        paragraph_texts = [paragraph.text for paragraph in loaded.paragraphs]
        return ' '.join(paragraph_texts)
    except Exception as exc:
        return f"Error reading DOCX: {exc}"
def extract_text_from_file(file):
    """Extract text from an uploaded file, chosen by its extension.

    Supported extensions (case-insensitive): ``pdf`` (PyPDF2), ``docx``
    (python-docx), ``png``/``jpg``/``jpeg`` (OCR through pytesseract) and
    ``txt`` (read as UTF-8).

    Args:
        file: Either an object whose ``name`` attribute is the file path, or
            the path itself as a string.

    Returns:
        The extracted text. On failure a message starting with ``"Error"`` is
        returned instead of raising, matching the other extractors in this
        module: ``"Error reading file: ..."`` when reading fails and
        ``"Error: unsupported file type ..."`` for any other extension.
    """
    # Accept both upload objects exposing ``.name`` and plain path strings.
    path = str(getattr(file, 'name', file))
    file_extension = path.rsplit('.', 1)[-1].lower()

    try:
        if file_extension == 'pdf':
            pages = PdfReader(path).pages
            # extract_text() may yield nothing for image-only pages.
            return ''.join(page.extract_text() or '' for page in pages)
        if file_extension == 'docx':
            return ' '.join(para.text for para in docx.Document(path).paragraphs)
        if file_extension in ('png', 'jpg', 'jpeg'):
            # Close the image handle once OCR is done.
            with Image.open(path) as image:
                return pytesseract.image_to_string(image)
        if file_extension == 'txt':
            with open(path, 'r', encoding='utf-8') as f:
                return f.read()
    except Exception as e:
        # Deliberately broad: every failure is converted into a message.
        return f"Error reading file: {e}"

    return f"Error: unsupported file type '.{file_extension}'"
def _youtube_video_id(url):
    """Return the video id contained in a YouTube *url*, or ``None``."""
    from urllib.parse import parse_qs, urlparse

    candidate = url.strip()
    if '//' not in candidate:
        # urlparse only fills in the host when a scheme separator is present.
        candidate = 'https://' + candidate
    parsed = urlparse(candidate)
    host = (parsed.hostname or '').lower()
    segments = [part for part in parsed.path.split('/') if part]

    if host == 'youtu.be' or host.endswith('.youtu.be'):
        # Short links carry the id as the first path segment; the query
        # string (e.g. "?t=30") is already separated by urlparse.
        return segments[0] if segments else None

    if host == 'youtube.com' or host.endswith('.youtube.com'):
        query_ids = parse_qs(parsed.query).get('v')
        if query_ids:
            return query_ids[0]
        # Path based forms such as /shorts/<id>, /embed/<id>, /live/<id>.
        if len(segments) >= 2 and segments[0] in ('shorts', 'embed', 'live', 'v'):
            return segments[1]

    return None


def extract_text_from_youtube(url):
    """Return the transcript text of the YouTube video referenced by *url*.

    Args:
        url: A ``youtube.com`` or ``youtu.be`` video address.

    Returns:
        The transcript entries joined with single spaces;
        ``"Invalid YouTube URL"`` when no video id can be found in *url*; or
        a message starting with ``"Error fetching YouTube transcript:"`` when
        the transcript cannot be retrieved.
    """
    try:
        video_id = _youtube_video_id(url)
        if not video_id:
            return "Invalid YouTube URL"
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(item['text'] for item in transcript)
    except Exception as e:
        # Deliberately broad: every failure is converted into a message.
        return f"Error fetching YouTube transcript: {e}"
def summarize_text(text, file, link, youtube_link, method):
    """Summarize the first available input with the chosen method.

    Inputs are considered in this order: pasted *text*, uploaded *file*,
    web *link*, *youtube_link*. Only the first non-empty one is used.

    Args:
        text: Text pasted by the user.
        file: Uploaded file, passed to ``extract_text_from_file``.
        link: URL of a web page, passed to ``extract_text_from_url``.
        youtube_link: URL of a YouTube video, passed to
            ``extract_text_from_youtube``.
        method: One of ``"TF-IDF"``, ``"TextRank"`` or ``"Abstractive"``.

    Returns:
        The summary; the extractor's failure message when extraction failed;
        a short notice when there is nothing to summarize; or ``None`` for an
        unrecognized *method*.
    """
    extracted = True
    if text and text.strip():
        input_text = text
        extracted = False
    elif file:
        input_text = extract_text_from_file(file)
    elif link:
        input_text = extract_text_from_url(link)
    elif youtube_link:
        input_text = extract_text_from_youtube(youtube_link)
    else:
        input_text = ""

    # Extractors report failure by returning a message instead of raising.
    # Only extractor output is inspected, and only its prefix: a substring
    # test would also reject any legitimate text that merely mentions
    # "Error". Extracted content that itself begins with "Error" is still
    # indistinguishable from a failure message.
    if extracted and input_text and input_text.startswith(("Error", "Invalid YouTube URL")):
        return input_text

    if not input_text or not input_text.strip():
        # The summarizers cannot handle empty input.
        return "No text to summarize. Provide text, a file, a link, or a YouTube link."

    if method == "TF-IDF":
        return tfidf_summary(input_text)
    elif method == "TextRank":
        return textrank_summary(input_text)
    elif method == "Abstractive":
        return bart_summary(input_text)
    return None
# Input components, in the positional order summarize_text receives them:
# text, file, link, youtube_link, method.
_summarizer_inputs = [
    gr.Textbox(lines=8, placeholder="Paste your text here...", label="Input Text"),
    gr.File(label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files"),
    gr.Textbox(lines=1, placeholder="Enter URL here...", label="Input Link"),
    gr.Textbox(
        lines=1,
        placeholder="Enter YouTube video URL here...",
        label="Input YouTube Link",
    ),
    gr.Radio(
        choices=["TF-IDF", "TextRank", "Abstractive"],
        label="Summarization Method",
        value="Abstractive",
    ),
]

# Single output box that shows whatever summarize_text returns.
_summary_output = gr.Textbox(lines=15, label="Concise Summary")

# Build the Gradio interface around summarize_text.
interface = gr.Interface(
    fn=summarize_text,
    inputs=_summarizer_inputs,
    outputs=_summary_output,
    title="Text Summarizer",
    description="Get a clear and concise summary of your text!",
    theme="default",
)

# Start the web UI; share=True asks Gradio for a public share link.
interface.launch(share=True)