PowerCompute750 committed on
Commit
e54354a
·
verified ·
1 Parent(s): 00fdbe9

Added initial files including models and runtime

Browse files
Files changed (4) hide show
  1. app.py +89 -0
  2. pdf_model.pkl +3 -0
  3. requirements.txt +0 -0
  4. vectorizer.pkl +3 -0
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import PyPDF2
4
+ import nltk
5
+ from collections import Counter
6
+
7
# Fetch the NLTK resources used below (tokenizer models and the stopword
# corpus) before the nltk.corpus / nltk.tokenize imports that follow.
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource)
10
+
11
+ from nltk.corpus import stopwords
12
+ from nltk.tokenize import word_tokenize
13
+
14
# Load the classifier and its text vectorizer once at import time; both
# artifacts are read from the working directory, classifier first.
model, vectorizer = (
    joblib.load(artifact) for artifact in ("pdf_model.pkl", "vectorizer.pkl")
)
16
+
17
+
18
def extract_text(file):
    """Return the text of every page of a PDF, separated by newlines.

    Args:
        file: The value handed to ``PyPDF2.PdfReader``; the upload received
            by the Gradio handler is passed straight through.

    Returns:
        The text of all pages joined with ``"\\n"``. Pages for which
        ``extract_text()`` returns a falsy value are skipped, so the result
        is ``""`` when no page yields any text.
    """
    reader = PyPDF2.PdfReader(file)

    # Collect the pages and join once instead of growing a string with +=.
    # The explicit separator keeps the last word of one page from being glued
    # to the first word of the next, which would corrupt tokenization.
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text)

    return "\n".join(page_texts)
30
+
31
+
32
def extract_keywords(text):
    """Return the five most frequent alphabetic, non-stopword tokens in *text*.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic or that appear in NLTK's English stopword list
    are dropped.

    Args:
        text: The document text to analyze.

    Returns:
        A list of at most five words, ordered as ``Counter.most_common``
        orders them (most frequent first). Empty when no token survives the
        filter.
    """
    # Build the stopword set once. Calling stopwords.words() inside the
    # comprehension repeated the lookup for every token and tested membership
    # against a list (linear) instead of a set (constant time).
    english_stopwords = set(stopwords.words("english"))

    words = word_tokenize(text.lower())
    filtered = [
        w for w in words
        if w.isalpha() and w not in english_stopwords
    ]

    counts = Counter(filtered)
    return [w for w, _ in counts.most_common(5)]
46
+
47
+
48
def summarize(text):
    """Return the first three period-delimited segments of *text*.

    The text is split on ``"."`` and the leading three pieces are re-joined
    with ``"."``. Text with fewer than three periods is returned unchanged.
    """
    # Splitting at most three times is enough: only the first three pieces
    # are kept, and they are the same as with an unbounded split.
    leading = text.split(".", 3)[:3]
    return ".".join(leading)
52
+
53
+
54
def analyze_pdf(file):
    """Classify an uploaded PDF and report its category, keywords and summary.

    Args:
        file: The value of the Gradio file input. May be ``None`` when the
            handler is invoked without an upload.

    Returns:
        A multi-line report string containing the predicted category, the
        extracted keywords and a short summary. A short explanatory message
        is returned instead when no file was supplied or when the PDF has no
        extractable text.
    """
    # Guard: without an upload there is nothing to read.
    if file is None:
        return "Please upload a PDF file."

    text = extract_text(file)

    # Guard: with no extractable text, the keywords, summary and prediction
    # would all be computed from an empty string and be meaningless.
    if not text.strip():
        return "No extractable text was found in this PDF."

    keywords = extract_keywords(text)
    summary = summarize(text)

    X = vectorizer.transform([text])
    pred = model.predict(X)[0]

    category = {
        0: "Finance / Banking Document",
        1: "Technology / Cloud / Machine Learning",
    }
    # Fall back to a readable label instead of raising KeyError if the model
    # ever returns a class that is not in the table above.
    label = category.get(pred, f"Unknown (class {pred})")

    return f"""
Category: {label}

Keywords: {", ".join(keywords)}

Summary:
{summary}
"""
79
+
80
+
81
# Gradio UI: a single file upload in, the plain-text report from analyze_pdf out.
iface = gr.Interface(
    fn=analyze_pdf,
    # Restrict the upload widget to PDFs, the only format extract_text reads.
    inputs=gr.File(file_types=[".pdf"]),
    outputs="text",
    title="AI PDF Analyzer",
    description="Upload a PDF to analyze its content, keywords and summary.",
)

# Start the server only when this file is run as a script, so importing the
# module (e.g. to reuse analyze_pdf) does not launch the web app.
if __name__ == "__main__":
    iface.launch()
pdf_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9a7f7ef6a86fc591f4552b7c665519fb134f573253c87d299e6da40ca8a6335
3
+ size 991
requirements.txt ADDED
Binary file (102 Bytes). View file
 
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f4bdc3a0a3119f554f85204bc0b07cecb902963980b428f35b3f77f7affdf4f
3
+ size 1178