Fluospark128 committed on
Commit
a42faad
·
verified ·
1 Parent(s): 028e95f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ from transformers import pipeline
4
+
5
+ # Zero-shot classification pipeline
6
@st.cache_resource
def load_classifier():
    """Build the zero-shot classification pipeline used for genre detection.

    The function is wrapped in ``st.cache_resource`` so the pipeline object
    is created once and reused rather than rebuilt on each call.

    Returns:
        The ``transformers`` pipeline for the "zero-shot-classification"
        task, backed by the "facebook/bart-large-mnli" model.
    """
    classifier = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
    )
    return classifier
9
+
10
+ # Function to extract text from PDF
11
def extract_text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (here, the uploaded file
            object handed over by the Streamlit file uploader).

    Returns:
        The text of all pages joined in page order, with no separator added
        between pages. An empty string when no page yields any text.
    """
    reader = PdfReader(pdf_file)
    # ``or ""`` is defensive: a page without an extractable text layer must
    # contribute nothing instead of breaking the concatenation if the
    # extractor hands back an empty/None result.
    return "".join(page.extract_text() or "" for page in reader.pages)
17
+
18
+ # Streamlit app UI
19
def main():
    """Render the Streamlit UI and classify an uploaded PDF into genres.

    Flow: show the title and instructions, wait for a PDF upload, extract
    its text, run zero-shot classification against a fixed list of genre
    labels, and display the 20 highest-scoring genres. If no text can be
    extracted, an error message is shown instead.
    """
    st.title("PDF Genre Classifier")
    st.write("Upload a PDF file, and this app will classify its genres using zero-shot classification.")

    # File uploader; nothing else to do until the user provides a file.
    pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
    if pdf_file is None:
        return

    st.write("Processing the PDF...")
    text = extract_text_from_pdf(pdf_file)

    if not text.strip():
        st.error("No text could be extracted from the PDF. Please try another file.")
        return

    st.write("PDF Text Extracted. Performing Genre Classification...")
    classifier = load_classifier()

    # Candidate genres offered to the zero-shot classifier.
    candidate_labels = [
        "Romance", "Mystery", "Thriller", "Science Fiction", "Fantasy",
        "Horror", "Historical Fiction", "Crime", "Western", "Dystopian",
        "Biography", "Autobiography", "Memoir", "History", "Self-Help",
        "Travel", "Essay", "Journalism", "Sonnet", "Haiku", "Free Verse",
        "Narrative Poetry", "Lyric Poetry", "Tragedy", "Comedy", "Melodrama",
        "Farce", "Graphic Novel", "Epistolary", "Magical Realism", "Satire",
        "Young Adult Fiction",
    ]

    # Only the first 1000 characters are classified, to keep the input short
    # for the model. multi_label=True scores each genre independently, so a
    # document can rate highly for several genres at once.
    result = classifier(text[:1000], candidate_labels, multi_label=True)
    genres = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    st.subheader("Top 20 Detected Genres:")
    for genre, score in genres[:20]:
        st.write(f"**{genre.capitalize()}**: {score:.2f}")


# Without this call the script only defines functions and the page stays
# empty when the app is launched.
if __name__ == "__main__":
    main()