import gradio as gr
import joblib
import PyPDF2
import nltk
from collections import Counter
# Fetch the NLTK resources used below (tokenizer models + stopword list).
# nltk.download is a no-op when the resource is already cached locally.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Load the trained classifier and its fitted text vectorizer from disk.
# NOTE(review): paths are relative to the current working directory — presumably
# the app is launched from the directory containing these .pkl files; confirm.
model = joblib.load("pdf_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")
def extract_text(file):
    """Extract and return the text of every page of a PDF.

    Parameters
    ----------
    file : file path or binary file-like object accepted by PyPDF2.PdfReader.

    Returns
    -------
    str
        The extracted text of all pages, joined with newlines. Pages whose
        extraction yields None/"" (e.g. image-only pages) are skipped.
    """
    reader = PyPDF2.PdfReader(file)
    pages = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            pages.append(page_text)
    # Join with "\n" (instead of bare += concatenation) so the last word of one
    # page cannot fuse with the first word of the next, which would corrupt
    # downstream tokenization; also avoids quadratic string building.
    return "\n".join(pages)
def extract_keywords(text, top_n=5):
    """Return the most frequent non-stopword alphabetic tokens in *text*.

    Parameters
    ----------
    text : str
        Raw document text; lower-cased before tokenization.
    top_n : int, optional
        Number of keywords to return (default 5, matching prior behavior).

    Returns
    -------
    list[str]
        Up to *top_n* tokens, most frequent first.
    """
    words = word_tokenize(text.lower())
    # Hoist the stopword list into a set ONCE: the original rebuilt
    # stopwords.words("english") for every token, making the filter O(n*m)
    # with a per-token list scan instead of an O(1) set lookup.
    stop_words = set(stopwords.words("english"))
    filtered = [w for w in words if w.isalpha() and w not in stop_words]
    return [word for word, _ in Counter(filtered).most_common(top_n)]
def summarize(text):
    """Crude extractive summary: keep only the first three '.'-separated fragments.

    Sentences are approximated by splitting on '.'; the trailing period of the
    last kept fragment is not restored (fragments are re-joined with '.').
    """
    fragments = text.split(".")
    # Truncate in place rather than slicing a copy; safe when fewer than
    # three fragments exist.
    del fragments[3:]
    return ".".join(fragments)
def analyze_pdf(file):
    """Run the full pipeline on one uploaded PDF and format a text report.

    Parameters
    ----------
    file : uploaded file object/path as provided by the Gradio File input.

    Returns
    -------
    str
        A human-readable report with the predicted category, top keywords,
        and a naive first-sentences summary.
    """
    text = extract_text(file)
    keywords = extract_keywords(text)
    summary = summarize(text)

    # Vectorize the whole document and classify it with the pre-trained model.
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]

    category = {
        0: "Finance / Banking Document",
        1: "Technology / Cloud / Machine Learning",
    }
    # .get instead of direct indexing: a label outside {0, 1} (e.g. after the
    # model is retrained with more classes) previously raised KeyError and
    # crashed the request; now it degrades to an explicit "Unknown" label.
    label = category.get(pred, f"Unknown category ({pred})")

    return f"""
Category: {label}
Keywords: {", ".join(keywords)}
Summary:
{summary}
"""
# Wire the analyzer into a minimal Gradio UI: one file-upload input,
# one plain-text output showing the formatted report.
iface = gr.Interface(
    fn=analyze_pdf,
    inputs=gr.File(),
    outputs="text",
    title="AI PDF Analyzer",
    description="Upload a PDF to analyze its content, keywords and summary."
)
# Start the local web server (blocks until the process is interrupted).
iface.launch()