import os
import re
import json

import spacy
import spacy.cli
import nltk
import PyPDF2
import whisper
import numpy as np
import pandas as pd
import gradio as gr
from huggingface_hub import login
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from transformers import pipeline
from spacy import displacy

# Retrieve the Hugging Face token from secrets (the "Gemma" env var / secret).
token = os.getenv("Gemma")
if not token:
    raise ValueError("Token not found. Ensure it is set correctly in Secrets.")

# Authenticate with the Hugging Face Hub
login(token=token)

# Load the summarization pipeline.
# NOTE(review): gemma-2-2b-it is a causal LM, not a seq2seq summarization
# model — confirm the "summarization" task actually works with it.
summarizer = pipeline(
    "summarization",
    model="google/gemma-2-2b-it",
    token=token,  # `use_auth_token` is deprecated in recent transformers
)

# Initialize spaCy globally; download the model only if it is missing
# (the original unconditionally re-downloaded it on every start).
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    """Return *text* with spaCy stop words and punctuation removed."""
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)


def summarize_text(text):
    """Summarize *text* with the global HF pipeline (deterministic decoding)."""
    summary = summarizer(text, max_length=400, min_length=50, do_sample=False)
    return summary[0]['summary_text']


def sentiment_analysis(text):
    """Classify overall polarity of *text* as Positive / Negative / Neutral."""
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    if sentiment > 0:
        return "Positive"
    elif sentiment < 0:
        return "Negative"
    else:
        return "Neutral"


def extract_keywords(text):
    """Return up to 10 keywords ranked by descending TF-IDF weight.

    BUGFIX: `tfidf_matrix.sum(axis=0)` is a (1, n) matrix; the original
    `.argsort()[::-1]` reversed the (single) row axis — a no-op — and then
    sliced rows, so it returned *ascending*-scored terms in a mangled shape.
    Flatten to 1-D before ranking.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = np.array(vectorizer.get_feature_names_out())
    scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    top_idx = scores.argsort()[::-1][:10]
    return feature_names[top_idx].tolist()


def extract_decisions(text):
    """Collect sentences whose syntactic root is a verb (decision/action heuristic)."""
    doc = nlp(text)
    decisions = []
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "ROOT" and token.pos_ == "VERB":
                decisions.append(sent.text)
    return decisions


# Backend function to handle the uploaded
def handle_file_upload(uploaded_file):
    """Extract text from an uploaded PDF and run the full analysis pipeline.

    Returns a dict with 'summary', 'sentiment', 'keywords' and 'decisions',
    or None when no file was supplied.
    """
    if not uploaded_file:
        return None

    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # extract_text() returns None for pages without a text layer (e.g. scans);
    # substitute "" so concatenation never raises TypeError. join avoids the
    # quadratic cost of repeated `text +=`.
    text = "".join((page.extract_text() or "") for page in pdf_reader.pages)

    # Summarize the stop-word-stripped text; the other analyses use raw text.
    cleaned_text = preprocess_text(text)
    return {
        'summary': summarize_text(cleaned_text),
        'sentiment': sentiment_analysis(text),
        'keywords': extract_keywords(text),
        'decisions': extract_decisions(text),
    }


# Gradio callback: adapt the analysis dict to the four output textboxes.
def process_file(file):
    """Run the pipeline on *file* and format results for the Gradio outputs."""
    if file is not None:
        results = handle_file_upload(file)
        if results:
            return (
                results['summary'],
                results['sentiment'],
                ", ".join(map(str, results['keywords'])),
                "\n".join(results['decisions']),
            )
    return "No file uploaded!", "N/A", "N/A", "N/A"


# Define Gradio interface
interface = gr.Interface(
    fn=process_file,
    inputs=gr.File(label="Upload a PDF File"),
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Sentiment Analysis"),
        gr.Textbox(label="Keywords"),
        gr.Textbox(label="Decisions/Action Items"),
    ],
    title="Smart Meeting Summarizer",
    description="Upload your meeting notes or PDF file to get a summary, sentiment analysis, keywords, and decisions/action items.",
)

# Launch the Gradio app
interface.launch(debug=True, share=True)