File size: 4,614 Bytes
16d1baf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Download required NLTK data files: 'punkt' is the tokenizer data used by
# sent_tokenize/word_tokenize and 'stopwords' backs stopwords.words('english').
# Both calls run at import time, i.e. on every start of the app.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# Load pre-trained Pegasus model and tokenizer once, at module level, so that
# pegasus_summary can reuse the same objects on every call.
# NOTE(review): the name below has no "org/" prefix, so it presumably points
# at a local directory holding the fine-tuned weights — confirm.
model_name = "pegasus-fine_tuned_model"  # Example Pegasus model
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

def preprocess_text(text):
    """Split *text* into sentences and build a cleaned copy of each one.

    Every sentence is lower-cased and word-tokenized. Tokens that are English
    stop words, or that occur inside ``string.punctuation``, are discarded and
    the surviving tokens are re-joined with single spaces.

    Args:
        text: The raw input text.

    Returns:
        A ``(sentences, cleaned_sentences)`` tuple of two parallel lists: the
        sentences as produced by ``sent_tokenize`` and their cleaned versions.
    """
    sentences = sent_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    def _clean(sentence):
        # Keep a token only when it is neither a stop word nor punctuation.
        kept_tokens = [
            token
            for token in word_tokenize(sentence.lower())
            if not (token in english_stop_words or token in string.punctuation)
        ]
        return ' '.join(kept_tokens)

    return sentences, [_clean(sentence) for sentence in sentences]

def build_similarity_matrix(sentences):
    """Return the pairwise cosine-similarity matrix of *sentences*.

    The sentences are vectorized with a default ``TfidfVectorizer`` fitted on
    the given sentences themselves; ``cosine_similarity`` is then applied to
    those vectors and its result is returned unchanged.

    Args:
        sentences: Iterable of sentence strings to compare with each other.
    """
    sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(sentence_vectors)

def textrank_summary(text, num_sentences=3):
    """Build an extractive summary of *text* by ranking sentences with PageRank.

    The text is split and cleaned by ``preprocess_text``; a TF-IDF cosine
    similarity matrix of the cleaned sentences is turned into a graph, and
    ``networkx.pagerank`` scores each sentence. The ``num_sentences`` highest
    scoring original sentences are joined with single spaces, in descending
    score order (not in document order).

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences to keep in the summary.

    Returns:
        The summary string. An empty string is returned for empty or
        whitespace-only input. If the cleaned sentences contain no term that
        TF-IDF can use, the first ``num_sentences`` original sentences are
        returned instead of raising.
    """
    # Blank input has nothing to rank; without this guard the TF-IDF step
    # fails with an "empty vocabulary" ValueError.
    if not text or not text.strip():
        return ""

    original_sentences, preprocessed_sentences = preprocess_text(text)

    try:
        similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError when no usable term survives
        # preprocessing (e.g. the text consists only of stop words). There is
        # nothing to score, so fall back to the leading sentences.
        return ' '.join(original_sentences[:num_sentences])

    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Pair each original sentence with its score; ties on score are broken by
    # the sentence text because whole tuples are compared.
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_sentences)),
        reverse=True,
    )
    return ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])

def tfidf_summary(text, num_sentences=3):
    """Build an extractive summary of *text* from per-sentence TF-IDF mass.

    The text is split and cleaned by ``preprocess_text``; each cleaned
    sentence is vectorized with a default ``TfidfVectorizer`` and scored by
    the sum of its TF-IDF weights. The ``num_sentences`` highest scoring
    original sentences are joined with single spaces, in descending score
    order (not in document order).

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences to keep in the summary.

    Returns:
        The summary string. An empty string is returned for empty or
        whitespace-only input. If the cleaned sentences contain no term that
        TF-IDF can use, the first ``num_sentences`` original sentences are
        returned instead of raising.
    """
    # Blank input has nothing to score; without this guard fit_transform
    # fails with an "empty vocabulary" ValueError.
    if not text or not text.strip():
        return ""

    original_sentences, preprocessed_sentences = preprocess_text(text)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(preprocessed_sentences)
    except ValueError:
        # Raised when no usable term survives preprocessing (e.g. the text
        # consists only of stop words); fall back to the leading sentences.
        return ' '.join(original_sentences[:num_sentences])

    # Row sums of the TF-IDF matrix give one score per sentence.
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    best_first = np.argsort(sentence_scores, axis=0)[::-1]
    ranked_sentences = [original_sentences[i] for i in best_first]
    return ' '.join(ranked_sentences[:num_sentences])

def pegasus_summary(text):
    """Generate an abstractive summary of *text* with the module-level Pegasus model.

    The text is tokenized into PyTorch tensors (truncated to at most 1024
    tokens), the model generates a summary with 5-beam search, and the first
    generated sequence is decoded back to a string without special tokens.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
        padding=True,
    )
    generation_options = {
        "max_length": 250,  # upper bound on the generated length; adjust as needed
        "min_length": 30,  # lower bound on the generated length
        "num_beams": 5,
        "early_stopping": True,
    }
    generated_ids = model.generate(encoded["input_ids"], **generation_options)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

def summarize_text(text, method):
    """Summarize *text* with the summarization back end selected by *method*.

    Args:
        text: The text to summarize.
        method: One of ``"TF-IDF"``, ``"TextRank"`` or ``"Abstractive"``.

    Returns:
        The summary string produced by the selected back end, or an empty
        string when *text* is empty or whitespace-only. ``None`` is returned
        for an unrecognized *method* (kept for backward compatibility).
    """
    if method not in ("TF-IDF", "TextRank", "Abstractive"):
        return None

    # Nothing to summarize: the extractive back ends would fail on an empty
    # vocabulary and the abstractive model would generate text from nothing.
    if not text or not text.strip():
        return ""

    if method == "TF-IDF":
        return tfidf_summary(text)
    if method == "TextRank":
        return textrank_summary(text)
    return pegasus_summary(text)

# Custom CSS for styling, handed to gr.Interface through its css= argument.
# It defines rules for four selectors: .gr-box (card-like container),
# .gr-input / .gr-output (the classes attached to the two text boxes via
# elem_classes) and .gr-button (including its :hover state).
custom_css = """
.gr-box {
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
    margin: 20px 0;
    background-color: #fff;
}

.gr-input, .gr-output {
    border: 1px solid #ccc;
    border-radius: 5px;
    padding: 10px;
    font-size: 16px;
}

.gr-button {
    background-color: #007bff;
    color: white;
    padding: 10px 20px;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    cursor: pointer;
    transition: background-color 0.3s;
}

.gr-button:hover {
    background-color: #0056b3;
}
"""

# Build the Gradio UI: a text box and a method selector feed summarize_text,
# and the returned summary is displayed in a second text box.
text_input = gr.Textbox(
    lines=30,
    placeholder="Paste your text here...",
    label="Input Text",
    elem_classes="gr-input",  # matched by the .gr-input rule in custom_css
)
method_selector = gr.Radio(
    choices=["TF-IDF", "TextRank", "Abstractive"],
    label="Summarization Method",
    value="Abstractive",  # pre-selected option
)
summary_output = gr.Textbox(
    lines=30,
    label="Concise Summary",
    elem_classes="gr-output",  # matched by the .gr-output rule in custom_css
)

interface = gr.Interface(
    fn=summarize_text,
    inputs=[text_input, method_selector],
    outputs=summary_output,
    title="Pegasus Text Summarizer",
    description="Get a clear and concise summary of your text in seconds!",
    theme="default",  # built-in theme
    css=custom_css,  # custom styling defined above
)

# Launch the interface. This runs at import time because the call is not
# wrapped in an `if __name__ == "__main__":` guard.
# share=True requests a public Gradio share link in addition to the local
# server — NOTE(review): confirm a public link is intended for this app.
interface.launch(
    share=True,
    debug=True  # Enable debug mode for error handling (optional)
)