Spaces:

Nikhil2411
/

textsumerizar1234

Sleeping

App Files Files Community

textsumerizar1234 / app.py

Nikhil2411

Update app.py

e43fe26 verified over 1 year ago

raw

history blame contribute delete

6.5 kB

	import gradio as gr
	import nltk
	import numpy as np
	import networkx as nx
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from nltk.tokenize import sent_tokenize, word_tokenize
	from nltk.corpus import stopwords
	import string
	from transformers import BartForConditionalGeneration, BartTokenizer
	import requests
	from PyPDF2 import PdfReader
	from bs4 import BeautifulSoup
	import fitz # PyMuPDF
	import docx
	from PIL import Image
	import pytesseract
	from youtube_transcript_api import YouTubeTranscriptApi

	# Download only the NLTK resources this app uses: the Punkt sentence
	# tokenizer models (used by sent_tokenize/word_tokenize) and the stopword
	# lists. Fetching 'all' pulls the entire NLTK data collection on every start.
	# NOTE(review): 'punkt_tab' is the resource name newer NLTK releases look up;
	# older releases only know 'punkt' and will just report the unknown name.
	for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
	    nltk.download(_nltk_resource)

	# Load pre-trained BART model and tokenizer
	model_name = "facebook/bart-large-cnn"
	tokenizer = BartTokenizer.from_pretrained(model_name)
	model = BartForConditionalGeneration.from_pretrained(model_name)

	def preprocess_text(text):
	    """Split *text* into sentences and build a cleaned copy of each one.

	    Returns a pair ``(sentences, preprocessed_sentences)``. The first item is
	    the list produced by ``sent_tokenize``. The second holds, in the same
	    order, each sentence lower-cased and word-tokenized, with tokens that are
	    English stop words or that occur in ``string.punctuation`` dropped and
	    the remaining tokens joined by single spaces.
	    """
	    raw_sentences = sent_tokenize(text)
	    english_stop_words = set(stopwords.words('english'))

	    def _clean(sentence):
	        # Keep only tokens that are neither stop words nor punctuation.
	        kept_tokens = []
	        for token in word_tokenize(sentence.lower()):
	            if token in english_stop_words or token in string.punctuation:
	                continue
	            kept_tokens.append(token)
	        return ' '.join(kept_tokens)

	    cleaned_sentences = [_clean(sentence) for sentence in raw_sentences]
	    return raw_sentences, cleaned_sentences

	def build_similarity_matrix(sentences):
	    """Return the pairwise cosine similarity of the TF-IDF vectors of *sentences*.

	    A fresh ``TfidfVectorizer`` is fitted on the given sentences and the
	    resulting matrix is compared against itself with ``cosine_similarity``.
	    """
	    sentence_vectors = TfidfVectorizer().fit_transform(sentences)
	    return cosine_similarity(sentence_vectors)

	def textrank_summary(text, num_sentences=5):
	    """Build an extractive summary by ranking sentences with PageRank.

	    The text is split into sentences, a TF-IDF cosine-similarity matrix of
	    the preprocessed sentences is turned into a graph, and ``nx.pagerank``
	    scores each sentence.

	    Args:
	        text: The text to summarize.
	        num_sentences: Maximum number of sentences in the summary.

	    Returns:
	        The top-scoring original sentences joined by spaces, highest score
	        first. An empty string is returned for empty or whitespace-only
	        input. If the similarity matrix cannot be built, the leading
	        sentences are returned instead.
	    """
	    # Nothing to tokenize: avoid fitting a vectorizer on zero documents.
	    if not text or not text.strip():
	        return ""
	    original_sentences, preprocessed_sentences = preprocess_text(text)
	    if not original_sentences:
	        return ""
	    try:
	        similarity_matrix = build_similarity_matrix(preprocessed_sentences)
	    except ValueError:
	        # TfidfVectorizer raises ValueError when the vocabulary is empty
	        # (e.g. every token was a stop word); there is nothing to rank.
	        return ' '.join(original_sentences[:num_sentences])
	    similarity_graph = nx.from_numpy_array(similarity_matrix)
	    scores = nx.pagerank(similarity_graph)
	    ranked_sentences = sorted(
	        ((scores[i], s) for i, s in enumerate(original_sentences)),
	        reverse=True,
	    )
	    return ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])

	def tfidf_summary(text, num_sentences=5):
	    """Build an extractive summary from the sentences with the highest TF-IDF mass.

	    Each preprocessed sentence is scored by the sum of its TF-IDF weights.

	    Args:
	        text: The text to summarize.
	        num_sentences: Maximum number of sentences in the summary.

	    Returns:
	        The top-scoring original sentences joined by spaces, highest score
	        first. An empty string is returned for empty or whitespace-only
	        input. If the TF-IDF matrix cannot be built, the leading sentences
	        are returned instead.
	    """
	    # Nothing to tokenize: avoid fitting a vectorizer on zero documents.
	    if not text or not text.strip():
	        return ""
	    original_sentences, preprocessed_sentences = preprocess_text(text)
	    if not original_sentences:
	        return ""
	    try:
	        tfidf_matrix = TfidfVectorizer().fit_transform(preprocessed_sentences)
	    except ValueError:
	        # TfidfVectorizer raises ValueError when the vocabulary is empty
	        # (e.g. every token was a stop word); there is nothing to score.
	        return ' '.join(original_sentences[:num_sentences])
	    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
	    best_first = np.argsort(sentence_scores, axis=0)[::-1]
	    return ' '.join(original_sentences[i] for i in best_first[:num_sentences])

	def bart_summary(text):
	    """Generate an abstractive summary of *text* with the module-level BART model.

	    The text is tokenized with truncation at 1024 tokens, a summary is
	    generated with 4-beam search (length bounds 50..1000), and the first
	    generated sequence is decoded without special tokens.
	    """
	    encoded = tokenizer(
	        text,
	        return_tensors="pt",
	        max_length=1024,
	        truncation=True,
	        padding=True,
	    )
	    generation_options = {
	        "max_length": 1000,
	        "min_length": 50,
	        "num_beams": 4,
	        "early_stopping": True,
	    }
	    generated_ids = model.generate(encoded["input_ids"], **generation_options)
	    first_sequence = generated_ids[0]
	    return tokenizer.decode(first_sequence, skip_special_tokens=True)

	def extract_text_from_url(url):
	    """Download a web page and return the text of its ``<p>`` elements.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The paragraph texts joined by spaces, or a string starting with
	        ``"Error fetching link:"`` if the request or parsing fails.
	    """
	    try:
	        # Without a timeout requests waits indefinitely on an unresponsive host.
	        response = requests.get(url, timeout=15)
	        # Treat HTTP 4xx/5xx as failures instead of summarizing the error page.
	        response.raise_for_status()
	        soup = BeautifulSoup(response.content, 'html.parser')
	        return ' '.join(para.get_text() for para in soup.find_all('p'))
	    except Exception as e:
	        return f"Error fetching link: {e}"

	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page of a PDF, read with PyMuPDF.

	    Args:
	        pdf_path: Path of the PDF file to open.

	    Returns:
	        The page texts concatenated in page order, or a string starting
	        with ``"Error reading PDF:"`` if the file cannot be read.
	    """
	    try:
	        # The context manager closes the document even if a page fails.
	        with fitz.open(pdf_path) as document:
	            return ''.join(page.get_text() for page in document)
	    except Exception as e:
	        return f"Error reading PDF: {e}"

	def extract_text_from_docx(docx_path):
	    """Return the paragraph texts of a DOCX file joined by single spaces.

	    Any exception raised while opening or reading the document is turned
	    into a string starting with ``"Error reading DOCX:"``.
	    """
	    try:
	        paragraphs = docx.Document(docx_path).paragraphs
	        paragraph_texts = [paragraph.text for paragraph in paragraphs]
	    except Exception as exc:
	        return f"Error reading DOCX: {exc}"
	    return ' '.join(paragraph_texts)

	def extract_text_from_file(file):
	    """Extract text from an uploaded PDF, DOCX, image, or plain-text file.

	    The handler is chosen from the lower-cased text after the last ``.`` in
	    the file name: ``pdf`` (PyPDF2), ``docx`` (python-docx), ``png``/``jpg``/
	    ``jpeg`` (OCR through pytesseract) and ``txt`` (read as UTF-8).

	    Args:
	        file: An object exposing the file path as ``.name``, or the path
	            itself as a string.

	    Returns:
	        The extracted text. On failure or for an unsupported extension a
	        string starting with ``"Error"`` is returned, matching the other
	        extractors in this file.
	    """
	    try:
	        # Accept both upload objects (with .name) and bare path strings.
	        path = getattr(file, "name", file)
	        file_extension = str(path).split('.')[-1].lower()
	        if file_extension == 'pdf':
	            pdf_reader = PdfReader(path)
	            # Guard against pages with no extractable text yielding None.
	            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
	        if file_extension == 'docx':
	            doc = docx.Document(path)
	            return ' '.join(para.text for para in doc.paragraphs)
	        if file_extension in ('png', 'jpg', 'jpeg'):
	            # Close the image handle once OCR is done.
	            with Image.open(path) as image:
	                return pytesseract.image_to_string(image)
	        if file_extension == 'txt':
	            with open(path, 'r', encoding='utf-8') as f:
	                return f.read()
	        return f"Error: unsupported file type '.{file_extension}'"
	    except Exception as e:
	        return f"Error reading file: {e}"

	def extract_text_from_youtube(url):
	    """Fetch the transcript of a YouTube video and return it as one string.

	    The video id is taken from the ``v`` query parameter of a
	    ``youtube.com`` URL, from the second path segment of ``/shorts/``,
	    ``/embed/``, ``/live/`` and ``/v/`` URLs, or from the first path segment
	    of a ``youtu.be`` URL. Query strings and fragments never leak into the id.

	    Args:
	        url: The video URL, with or without a scheme.

	    Returns:
	        The transcript entries joined by spaces. If no video id can be
	        found, ``"Error: Invalid YouTube URL"`` is returned; any other
	        failure yields a string starting with
	        ``"Error fetching YouTube transcript:"``. Both start with "Error"
	        so callers that look for that marker treat them as failures.
	    """
	    from urllib.parse import parse_qs, urlparse

	    try:
	        candidate = url.strip()
	        # urlparse only recognises the host when a scheme is present.
	        if "://" not in candidate:
	            candidate = "https://" + candidate
	        parsed = urlparse(candidate)
	        host = (parsed.hostname or "").lower()
	        if host.startswith("www."):
	            host = host[4:]
	        path_parts = [part for part in parsed.path.split("/") if part]

	        video_id = ""
	        if host == "youtu.be":
	            if path_parts:
	                video_id = path_parts[0]
	        elif host == "youtube.com" or host.endswith(".youtube.com"):
	            query_ids = parse_qs(parsed.query).get("v")
	            if query_ids:
	                video_id = query_ids[0]
	            elif len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live", "v"):
	                video_id = path_parts[1]

	        if not video_id:
	            return "Error: Invalid YouTube URL"

	        transcript = YouTubeTranscriptApi.get_transcript(video_id)
	        return ' '.join(item['text'] for item in transcript)
	    except Exception as e:
	        return f"Error fetching YouTube transcript: {e}"

	def summarize_text(text, file, link, youtube_link, method):
	    """Summarize the first available input with the chosen method.

	    Inputs are tried in this order: pasted text, uploaded file, web link,
	    YouTube link. Blank or whitespace-only strings count as absent.

	    Args:
	        text: Text pasted by the user.
	        file: Uploaded file object, or a falsy value when none was given.
	        link: URL of a web page to summarize.
	        youtube_link: URL of a YouTube video whose transcript is summarized.
	        method: One of ``"TF-IDF"``, ``"TextRank"`` or ``"Abstractive"``.

	    Returns:
	        The summary, or a human-readable message when no input was given,
	        extraction failed, nothing could be extracted, or the method is
	        unknown.
	    """
	    extracted = True
	    if text and text.strip():
	        input_text = text
	        extracted = False
	    elif file:
	        input_text = extract_text_from_file(file)
	    elif link and link.strip():
	        input_text = extract_text_from_url(link.strip())
	    elif youtube_link and youtube_link.strip():
	        input_text = extract_text_from_youtube(youtube_link.strip())
	    else:
	        return "Please provide text, a file, a link, or a YouTube link to summarize."

	    if extracted:
	        input_text = input_text or ""
	        # Extractors report failures as strings with these prefixes. Only
	        # extracted input is checked, so pasted text that merely mentions
	        # the word "Error" is still summarized.
	        if input_text.startswith(("Error", "Invalid YouTube URL")):
	            return input_text
	        if not input_text.strip():
	            return "No text could be extracted from the provided input."

	    summarizers = {
	        "TF-IDF": tfidf_summary,
	        "TextRank": textrank_summary,
	        "Abstractive": bart_summary,
	    }
	    summarizer = summarizers.get(method)
	    if summarizer is None:
	        return f"Unknown summarization method: {method}"
	    return summarizer(input_text)

	# Build the input widgets in the order summarize_text expects its arguments:
	# pasted text, uploaded file, web link, YouTube link, summarization method.
	_text_input = gr.Textbox(lines=8, placeholder="Paste your text here...", label="Input Text")
	_file_input = gr.File(label="Upload PDF, DOCX, JPG, PNG, JPEG, or Text Files")
	_link_input = gr.Textbox(lines=1, placeholder="Enter URL here...", label="Input Link")
	_youtube_input = gr.Textbox(
	    lines=1,
	    placeholder="Enter YouTube video URL here...",
	    label="Input YouTube Link",
	)
	_method_input = gr.Radio(
	    choices=["TF-IDF", "TextRank", "Abstractive"],
	    label="Summarization Method",
	    value="Abstractive",
	)
	_summary_output = gr.Textbox(lines=15, label="Concise Summary")

	# Wire the widgets to the summarizer in a single Gradio interface.
	interface = gr.Interface(
	    fn=summarize_text,
	    inputs=[_text_input, _file_input, _link_input, _youtube_input, _method_input],
	    outputs=_summary_output,
	    title="Text Summarizer",
	    description="Get a clear and concise summary of your text!",
	    theme="default",
	)

	# Start the web app, requesting a public share link.
	interface.launch(share=True)