"""AI PDF Analyzer: Gradio app that classifies a PDF and reports keywords and a summary."""
from collections import Counter

import gradio as gr
import joblib
import nltk
import PyPDF2
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokenizer models and the stopword list must be available before any
# request is handled; nltk.download is a no-op when the data is cached.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# Pre-trained classifier and its matching vectorizer (presumably TF-IDF --
# confirm against training code), loaded once at startup.
# NOTE(review): joblib.load unpickles arbitrary objects and can execute
# code -- only load model files from a trusted source.
model = joblib.load("pdf_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")
def extract_text(file):
    """Return the concatenated extractable text of every page in the PDF *file*."""
    reader = PyPDF2.PdfReader(file)
    # page.extract_text() may return None (or "") for pages with no
    # extractable text; drop those before joining.
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(chunk for chunk in page_texts if chunk)
def extract_keywords(text):
    """Return the five most frequent alphabetic, non-stopword tokens in *text*.

    Tokens are lowercased before counting; ties follow Counter.most_common
    ordering (first-seen wins among equal counts).
    """
    # Build the stopword set once: the original called stopwords.words()
    # inside the filter, re-reading the corpus list for every token and
    # doing an O(m) list-membership test per word.
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text.lower())
    filtered = [w for w in words if w.isalpha() and w not in stop_words]
    return [word for word, _ in Counter(filtered).most_common(5)]
def summarize(text):
    """Return a naive summary: the first three '.'-delimited segments of *text*.

    NOTE(review): this splits on every '.', so abbreviations such as
    "e.g." are treated as sentence boundaries, and the trailing period
    of the third sentence is not re-appended.
    """
    first_three = text.split(".")[:3]
    return ".".join(first_three)
def analyze_pdf(file):
    """Classify an uploaded PDF and report its category, keywords and summary.

    Parameters
    ----------
    file:
        The uploaded PDF as delivered by the Gradio File component
        (a path-like / file-like object accepted by PyPDF2.PdfReader).

    Returns
    -------
    str
        A human-readable report containing the predicted category, the
        top keywords and a short summary.
    """
    text = extract_text(file)
    keywords = extract_keywords(text)
    summary = summarize(text)

    # Vectorize the whole document and classify with the pre-trained model.
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]

    # Map the numeric label to a display name.  Use .get with a fallback
    # so an unexpected label degrades gracefully instead of raising
    # KeyError (the original indexed the dict directly).
    categories = {
        0: "Finance / Banking Document",
        1: "Technology / Cloud / Machine Learning",
    }
    category = categories.get(pred, f"Unknown category ({pred})")

    return f"""
Category: {category}
Keywords: {", ".join(keywords)}
Summary:
{summary}
"""
# Gradio UI: single file upload in, plain-text report out.
iface = gr.Interface(
    fn=analyze_pdf,
    inputs=gr.File(),
    outputs="text",
    title="AI PDF Analyzer",
    description="Upload a PDF to analyze its content, keywords and summary.",
)

# Launch only when run as a script, so importing this module (e.g. for
# testing or reuse of the helper functions) does not start the server.
if __name__ == "__main__":
    iface.launch()