# PowerCompute750's picture
# Added initial files including models and runtime
# e54354a verified
import gradio as gr
import joblib
import PyPDF2
import nltk
from collections import Counter
# Fetch the NLTK data this script uses at runtime: tokenizer models for
# word_tokenize and the English stopword list for stopwords.words.
nltk.download("punkt")
# NOTE(review): "punkt_tab" is presumably the tokenizer data name expected by
# newer NLTK releases - confirm against the installed NLTK version.
nltk.download("punkt_tab")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Load the classifier (used via .predict) and the text vectorizer (used via
# .transform) from files resolved relative to the current working directory.
# NOTE: joblib.load unpickles its input and can execute arbitrary code, so
# these files must come from a trusted source.
model = joblib.load("pdf_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")
def extract_text(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages that yielded any text, joined by newlines.
        An empty string when no page produced text.
    """
    reader = PyPDF2.PdfReader(file)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page cannot fuse with the
    # first word of the next page, which would corrupt tokenization. Pages
    # without extractable text are skipped.
    return "\n".join(chunk for chunk in page_texts if chunk)
def extract_keywords(text, top_n=5):
    """Return the most frequent alphabetic, non-stopword tokens in *text*.

    Args:
        text: Text to analyse; it is lower-cased before tokenizing.
        top_n: Maximum number of keywords to return (default 5).

    Returns:
        Up to ``top_n`` tokens ordered from most to least frequent.
    """
    # Build the stopword set once per call. Looking it up inside the filter
    # would call stopwords.words() and scan the resulting list for every
    # single token.
    english_stopwords = set(stopwords.words("english"))
    counts = Counter(
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in english_stopwords
    )
    return [word for word, _ in counts.most_common(top_n)]
def summarize(text, max_sentences=3):
    """Return the leading period-delimited sentences of *text*.

    Sentences are delimited by "." only; no other punctuation is treated as
    a sentence boundary.

    Args:
        text: Text to summarize.
        max_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        *text* unchanged when it holds at most ``max_sentences`` pieces,
        otherwise the first ``max_sentences`` pieces including the period
        that closes the last one. An empty string when ``max_sentences``
        is not positive.
    """
    if max_sentences <= 0:
        return ""
    # A bounded split avoids cutting up the whole document when only the
    # first few pieces are needed.
    pieces = text.split(".", max_sentences)
    if len(pieces) <= max_sentences:
        # Nothing was cut off, so the original text is the summary.
        return text
    # The extra piece proves a "." followed the last kept sentence; put it
    # back so the summary does not end without its terminating period.
    return ".".join(pieces[:max_sentences]) + "."
def analyze_pdf(file):
    """Classify an uploaded PDF and report its keywords and a short summary.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input, or
            ``None`` when nothing was uploaded.

    Returns:
        A multi-line report with the predicted category, the extracted
        keywords and the summary, or a short explanatory message when no
        file was supplied or the PDF yielded no text.
    """
    if file is None:
        # Submitting the form without an upload would otherwise fail inside
        # the PDF reader.
        return "Please upload a PDF file."

    text = extract_text(file)
    if not text.strip():
        # Classifying an empty string would report an arbitrary category.
        return "No extractable text was found in this PDF."

    keywords = extract_keywords(text)
    summary = summarize(text)
    features = vectorizer.transform([text])
    pred = model.predict(features)[0]
    category = {
        0: "Finance / Banking Document",
        1: "Technology / Cloud / Machine Learning",
    }
    # Fall back to a readable label instead of raising KeyError if the model
    # returns a class that has no display name here.
    label = category.get(pred, f"Unknown category ({pred})")
    return f"""
Category: {label}
Keywords: {", ".join(keywords)}
Summary:
{summary}
"""
# Web UI: a single file-upload input wired to analyze_pdf, with the report
# shown as plain text.
iface = gr.Interface(
    analyze_pdf,
    gr.File(),
    "text",
    title="AI PDF Analyzer",
    description="Upload a PDF to analyze its content, keywords and summary.",
)

# Start serving the interface.
iface.launch()