"""AI PDF Analyzer: Gradio app that classifies a PDF and extracts keywords and a summary."""

import gradio as gr
import joblib
import nltk
import PyPDF2
from collections import Counter

# Tokenizer models and the stopword corpus must be present before the
# nltk.corpus / nltk.tokenize imports below are used.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Pre-trained classifier and the vectorizer it was trained with.
model = joblib.load("pdf_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")

# Build the stopword set ONCE. Calling stopwords.words("english") per token
# (as a list) inside the filtering loop is accidentally O(n*m).
_STOPWORDS = set(stopwords.words("english"))

# Model class index -> human-readable label.
CATEGORY_LABELS = {
    0: "Finance / Banking Document",
    1: "Technology / Cloud / Machine Learning",
}


def extract_text(file):
    """Return the concatenated text of every page in the given PDF file."""
    reader = PyPDF2.PdfReader(file)
    parts = []
    for page in reader.pages:
        page_text = page.extract_text()
        # extract_text() may return None/"" for image-only or empty pages.
        if page_text:
            parts.append(page_text)
    # join once instead of repeated `text += ...` (quadratic worst case).
    return "".join(parts)


def extract_keywords(text, top_n=5):
    """Return the `top_n` most frequent alphabetic, non-stopword tokens.

    Tokens are lowercased before counting; `top_n` defaults to 5 to
    preserve the original behavior.
    """
    words = word_tokenize(text.lower())
    filtered = [w for w in words if w.isalpha() and w not in _STOPWORDS]
    return [w for w, _ in Counter(filtered).most_common(top_n)]


def summarize(text):
    """Naive summary: the first three '.'-separated sentence fragments."""
    sentences = text.split(".")
    return ".".join(sentences[:3])


def analyze_pdf(file):
    """Analyze an uploaded PDF: predicted category, keywords, short summary.

    Returns a formatted report string for the Gradio text output.
    """
    text = extract_text(file)
    keywords = extract_keywords(text)
    summary = summarize(text)
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]
    # .get() avoids a bare KeyError if the model ever emits an unseen class.
    label = CATEGORY_LABELS.get(pred, f"Unknown category ({pred})")
    return f"""
Category: {label}

Keywords: {", ".join(keywords)}

Summary: {summary}
"""


iface = gr.Interface(
    fn=analyze_pdf,
    inputs=gr.File(),
    outputs="text",
    title="AI PDF Analyzer",
    description="Upload a PDF to analyze its content, keywords and summary.",
)

# Guard the server launch so importing this module does not start Gradio.
if __name__ == "__main__":
    iface.launch()