File size: 4,392 Bytes
33c334b
 
6abc153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8029d94
d53a26f
8029d94
 
 
 
d53a26f
8029d94
 
6abc153
8029d94
29b5526
8029d94
 
 
29b5526
6abc153
de94233
 
 
6abc153
 
 
 
 
 
c30c941
 
6abc153
b70bff7
 
 
 
 
 
 
c30c941
b70bff7
6abc153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5252d6f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Download the small English spaCy model when the module is imported; the
# same model name is loaded further down with spacy.load("en_core_web_sm").
import spacy.cli
spacy.cli.download("en_core_web_sm")
from huggingface_hub import login
import os
import PyPDF2
import spacy
import nltk
from transformers import pipeline
import whisper
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
from textblob import TextBlob
from spacy import displacy
import gradio as gr
from huggingface_hub import login

# Read the Hugging Face token from the "Gemma" environment variable (secrets)
token = os.environ.get("Gemma")
if not token:
    raise ValueError("Token not found. Ensure it is set correctly in Secrets.")

# Log in to the Hugging Face Hub with that token
login(token=token)

# Build the summarization pipeline on the Gemma checkpoint
# NOTE(review): `use_auth_token` may be deprecated in newer transformers
# releases in favour of `token` - confirm against the pinned version.
summarizer = pipeline(
    task="summarization",
    model="google/gemma-2-2b-it",
    use_auth_token=token,
)

# Module-wide spaCy pipeline shared by the text helpers below
nlp = spacy.load("en_core_web_sm")

# Text preprocessing
def preprocess_text(text):
    """Drop stop words and punctuation from ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``; the
    texts of the surviving tokens are joined with single spaces.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return " ".join(kept)
    
#Text summarization
def summarize_text(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    The length limits are derived from the whitespace-delimited word count of
    the input: ``max_length`` is 80% of that count, capped at 400 and never
    below 1; ``min_length`` is 50 unless that would exceed ``max_length``, in
    which case it is lowered to ``max_length``.

    Args:
        text: The text to summarize.

    Returns:
        The ``'summary_text'`` field of the first pipeline result.

    Raises:
        ValueError: If ``text`` is ``None``, empty, or whitespace only.
    """
    if text is None or not text.strip():  # Check for empty text
        raise ValueError("Input text is empty or invalid.")

    input_length = len(text.split())  # Approximate input length in words
    # A one-word input would otherwise produce max_length == 0.
    max_len = max(1, min(400, int(0.8 * input_length)))
    # For inputs shorter than ~63 words the fixed min_length of 50 used to be
    # larger than max_length; clamp it so the two bounds never contradict.
    min_len = min(50, max_len)
    print(f"Input length: {input_length}, max_length: {max_len}")  # Debugging info

    # NOTE(review): the pipeline presumably counts these limits in tokens,
    # not words, so the word-based values are only an approximation - confirm.
    summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
    return summary[0]['summary_text']

# Sentiment analysis
def sentiment_analysis(text):
    """Label ``text`` as "Positive", "Negative" or "Neutral".

    The label follows the sign of the polarity score TextBlob reports for the
    text; a score of exactly zero maps to "Neutral".
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"

# Keyword extraction
def extract_keywords(text):
    """Return up to ten terms of ``text`` with the highest TF-IDF weight.

    The vectorizer is fitted on ``text`` as a single document with English
    stop words removed.

    Args:
        text: The document to extract keywords from.

    Returns:
        A flat list of at most ten terms, highest weight first. Terms with
        equal weight keep the vectorizer's vocabulary order. An empty list is
        returned when ``text`` is empty/blank or contains no usable terms.
    """
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary ends up empty,
        # e.g. when the document consists only of stop words.
        return []

    feature_names = np.array(vectorizer.get_feature_names_out())
    # Summing a scipy sparse matrix over axis 0 gives a 2-D (1, n_terms)
    # result. Flatten it first; otherwise the reversal and the [:10] slice
    # operate on the single row instead of on the individual terms.
    scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    # Stable sort on the negated scores: descending weight, deterministic ties.
    top_idx = np.argsort(-scores, kind="stable")[:10]
    return feature_names[top_idx].tolist()

# Decision/action item extraction
def extract_decisions(text):
    """Collect the sentences of ``text`` whose parse root is a verb.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A
    sentence's text is added once for every token in it that has dependency
    label ``ROOT`` and part-of-speech ``VERB``.
    """
    return [
        sentence.text
        for sentence in nlp(text).sents
        for tok in sentence
        if tok.dep_ == "ROOT" and tok.pos_ == "VERB"
    ]


# Backend function to handle uploaded file
def handle_file_upload(uploaded_file):
    """Run the full analysis on an uploaded PDF.

    The text of every page is extracted, then summarized (after stop-word and
    punctuation removal) and analysed for sentiment, keywords and
    decision-like sentences (the latter three on the raw extracted text).

    Args:
        uploaded_file: The PDF source handed to ``PyPDF2.PdfReader``. A falsy
            value means nothing was uploaded.

    Returns:
        A dict with the keys ``'summary'``, ``'sentiment'``, ``'keywords'``
        and ``'decisions'``, or ``None`` when ``uploaded_file`` is falsy.
    """
    if not uploaded_file:
        return None

    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next. `or ""` keeps a page that yields no
    # text from breaking the join.
    text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Only the summarizer works on the cleaned text; the other analyses use
    # the raw extraction.
    cleaned_text = preprocess_text(text)

    return {
        'summary': summarize_text(cleaned_text),
        'sentiment': sentiment_analysis(text),
        'keywords': extract_keywords(text),
        'decisions': extract_decisions(text),
    }


# Gradio Interface
def process_file(file):
    """Gradio callback: analyse ``file`` and return four display strings.

    Returns a tuple ``(summary, sentiment, keywords, decisions)``. Keywords
    are joined with ", " and decisions with newlines. When ``file`` is
    ``None`` or the analysis yields no result, a placeholder tuple is
    returned instead.
    """
    placeholder = ("No file uploaded!", "N/A", "N/A", "N/A")
    if file is None:
        return placeholder

    results = handle_file_upload(file)
    if not results:
        return placeholder

    summary_text = results['summary']
    sentiment_label = results['sentiment']
    keywords_line = ", ".join(str(keyword) for keyword in results['keywords'])
    decisions_block = "\n".join(results['decisions'])
    return summary_text, sentiment_label, keywords_line, decisions_block

# Build the Gradio UI: one PDF upload in, four text boxes out
pdf_input = gr.File(label="Upload a PDF File")
result_boxes = [
    gr.Textbox(label=box_label)
    for box_label in (
        "Summary",
        "Sentiment Analysis",
        "Keywords",
        "Decisions/Action Items",
    )
]
interface = gr.Interface(
    fn=process_file,
    inputs=pdf_input,
    outputs=result_boxes,
    title="Smart Meeting Summarizer",
    description="Upload your meeting notes or PDF file to get a summary, sentiment analysis, keywords, and decisions/action items.",
)

# Start the app (debug mode on, public share link requested)
interface.launch(debug=True, share=True)