# Scraped page header (not program code) - Spaces: Sleeping; File size: 4,614 Bytes; revision 16d1baf
import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Download the NLTK resources used by the extractive summarizers: the
# sentence/word tokenizer data and the English stopword list.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' models; fetching both keeps sent_tokenize/word_tokenize
# working regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the Pegasus tokenizer and model once at import time so that every
# summarization request reuses the same instances.
model_name = "pegasus-fine_tuned_model"  # checkpoint name handed to from_pretrained
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
def preprocess_text(text):
    """Split *text* into sentences and build a normalized copy of each one.

    Each sentence is lowercased and word-tokenized; English stopwords and
    punctuation-only tokens are removed, and the remaining tokens are joined
    back together with single spaces.

    Args:
        text: The raw input text.

    Returns:
        A tuple ``(sentences, preprocessed_sentences)``. Both lists have the
        same length and order: the first holds the sentences exactly as
        ``sent_tokenize`` produced them, the second their normalized forms
        (which may be empty strings when nothing survives filtering).
    """
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def _is_content_word(word):
        # A token is dropped when it is a stopword or consists solely of
        # punctuation characters. Testing per character (rather than
        # ``word in string.punctuation``, which is a substring test) also
        # removes multi-character tokens such as "...", "--", "``" and "''".
        if word in stop_words:
            return False
        return not all(ch in punctuation_chars for ch in word)

    preprocessed_sentences = [
        ' '.join(
            word for word in word_tokenize(sentence.lower())
            if _is_content_word(word)
        )
        for sentence in sentences
    ]
    return sentences, preprocessed_sentences
def build_similarity_matrix(sentences):
    """Return the pairwise cosine similarity of the sentences' TF-IDF vectors.

    Args:
        sentences: Iterable of sentence strings to vectorize.

    Returns:
        The square matrix produced by ``cosine_similarity``; entry ``[i][j]``
        is the similarity between sentence ``i`` and sentence ``j``.
    """
    # Fit a fresh vectorizer on exactly these sentences, then compare every
    # resulting row against every other row.
    sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(sentence_vectors)
def textrank_summary(text, num_sentences=3):
    """Build an extractive summary by ranking sentences with PageRank.

    Sentences are compared via TF-IDF cosine similarity, the similarity
    matrix is turned into a weighted graph, and the sentences with the
    highest PageRank scores are joined (highest score first).

    Args:
        text: The raw input text.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned when *text* contains no sentences; when no sentence yields
        any TF-IDF term, the leading sentences are returned unranked.
    """
    original_sentences, preprocessed_sentences = preprocess_text(text)
    if not original_sentences:
        return ''

    try:
        similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError on an empty vocabulary (e.g. the
        # text is only stopwords/punctuation). There is nothing to rank on,
        # so fall back to the first sentences instead of crashing.
        return ' '.join(original_sentences[:num_sentences])

    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Sort (score, sentence) pairs so the best-scoring sentences come first.
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_sentences)),
        reverse=True,
    )
    return ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])
def tfidf_summary(text, num_sentences=3):
    """Build an extractive summary from the sentences with the largest TF-IDF mass.

    Every sentence is scored by the sum of its TF-IDF term weights and the
    top-scoring sentences are joined (highest score first).

    Args:
        text: The raw input text.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces. An empty string is
        returned when *text* contains no sentences; when no sentence yields
        any TF-IDF term, the leading sentences are returned unranked.
    """
    original_sentences, preprocessed_sentences = preprocess_text(text)
    if not original_sentences:
        return ''

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(preprocessed_sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError on an empty vocabulary (e.g. the
        # text is only stopwords/punctuation). There is nothing to score,
        # so fall back to the first sentences instead of crashing.
        return ' '.join(original_sentences[:num_sentences])

    # Row sums of the TF-IDF matrix give one score per sentence.
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    best_first = np.argsort(sentence_scores, axis=0)[::-1]
    return ' '.join(original_sentences[i] for i in best_first[:num_sentences])
def pegasus_summary(text):
    """Generate an abstractive summary of *text* with the loaded Pegasus model.

    The text is tokenized (truncated to at most 1024 tokens), a summary is
    produced with beam search, and the best beam is decoded back to a string.

    Args:
        text: The raw input text.

    Returns:
        The decoded summary with special tokens stripped.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        padding=True,
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        # Forward the mask explicitly so generation never has to infer which
        # positions are padding.
        attention_mask=inputs["attention_mask"],
        max_length=250,  # upper bound on summary length in tokens; adjust as needed
        min_length=30,
        num_beams=5,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def summarize_text(text, method):
    """Summarize *text* with the summarizer selected by *method*.

    Args:
        text: The raw input text.
        method: One of ``"TF-IDF"``, ``"TextRank"`` or ``"Abstractive"``.

    Returns:
        The summary string, or an empty string when *text* is empty or
        contains only whitespace.

    Raises:
        ValueError: If *method* is not one of the supported names.
    """
    # Nothing to summarize: skip the summarizers entirely, since they either
    # fail or produce meaningless output on blank input.
    if not text or not text.strip():
        return ""

    if method == "TF-IDF":
        return tfidf_summary(text)
    if method == "TextRank":
        return textrank_summary(text)
    if method == "Abstractive":
        return pegasus_summary(text)

    # Fail loudly rather than silently returning None for a mistyped method.
    raise ValueError(f"Unknown summarization method: {method!r}")
# Custom CSS passed to gr.Interface(css=...) below.
# The .gr-input and .gr-output classes are attached to the two textboxes
# through their elem_classes arguments.
# NOTE(review): .gr-box and .gr-button are not assigned to any component in
# this file - confirm they match class names that Gradio itself emits.
custom_css = """
.gr-box {
border-radius: 10px;
padding: 20px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
margin: 20px 0;
background-color: #fff;
}
.gr-input, .gr-output {
border: 1px solid #ccc;
border-radius: 5px;
padding: 10px;
font-size: 16px;
}
.gr-button {
background-color: #007bff;
color: white;
padding: 10px 20px;
border: none;
border-radius: 5px;
font-size: 16px;
cursor: pointer;
transition: background-color 0.3s;
}
.gr-button:hover {
background-color: #0056b3;
}
"""
# Assemble the Gradio UI: a large text input plus a method selector feed
# summarize_text, whose result is shown in a read-out textbox.
_input_text = gr.Textbox(
    lines=30,
    placeholder="Paste your text here...",
    label="Input Text",
    elem_classes="gr-input",  # styled by custom_css
)
_method_choice = gr.Radio(
    choices=["TF-IDF", "TextRank", "Abstractive"],
    label="Summarization Method",
    value="Abstractive",
)
_summary_output = gr.Textbox(
    lines=30,
    label="Concise Summary",
    elem_classes="gr-output",  # styled by custom_css
)

interface = gr.Interface(
    fn=summarize_text,
    inputs=[_input_text, _method_choice],
    outputs=_summary_output,
    title="Pegasus Text Summarizer",
    description="Get a clear and concise summary of your text in seconds!",
    theme="default",  # built-in theme
    css=custom_css,
)
# Launch the interface only when this file is executed as a script, so that
# importing the module (e.g. to reuse the summarizers) does not start a server.
if __name__ == "__main__":
    interface.launch(
        share=True,
        debug=True,  # Enable debug mode for error handling (optional)
    )
# (trailing table delimiter from the page scrape removed)