# Fluospark128's picture
# Update app.py
# e9af6cc verified
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline
# Zero-shot classification pipeline
@st.cache_resource
#def load_classifier():
#return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Streamlit app UI
#def main():
#st.title("PDF Genre Classifier")
#st.write("Upload a PDF file, and this app will classify its genres using zero-shot classification.")
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
reader = PdfReader(pdf_file)
text = ""
for page in reader.pages:
text += page.extract_text()
return text
# Longest prefix (in characters) of the extracted text sent to the model.
MAX_TEXT_CHARS = 3000
# How many of the highest-scoring genres are shown to the user.
TOP_GENRE_COUNT = 20

# Candidate genres offered to the zero-shot classifier.
CANDIDATE_LABELS = [
    # Technical and scientific writing
    "Scientific Papers",
    "Technical Documentation",
    "Research Reports",
    "Academic Journals",
    "White Papers",
    "Technical Manuals",
    "Patents",
    "Software Documentation",
    "Engineering Specifications",
    "Computer Science Literature",
    "Machine Learning Publications",
    "Data Science Reports",
    "Network Architecture Descriptions",
    "Cybersecurity Analysis",
    "Algorithm Descriptions",
    # Fiction
    "Fantasy",
    "Science Fiction",
    "Mystery",
    "Thriller",
    "Romance",
    "Historical Fiction",
    "Horror",
    "Adventure",
    "Crime",
    "Western",
    "Dystopian",
    "Magical Realism",
    "Young Adult",
    "Children's Literature",
    "Gothic",
    # Non-fiction
    "Biography",
    "Autobiography",
    "Memoir",
    "Travel Writing",
    "History",
    "Philosophy",
    "Psychology",
    "Self-Help",
    "Political Commentary",
    "True Crime",
    "Nature Writing",
    "Cultural Studies",
    "Sociology",
    "Anthropology",
    "Religious Studies",
    # Literary forms
    "Poetry",
    "Drama",
    "Epic",
    "Short Story",
    "Novel",
    "Novella",
    "Satire",
    "Tragedy",
    "Comedy",
    "Tragicomedy",
    # Journalism
    "News Reporting",
    "Feature Writing",
    "Opinion Pieces",
    "Investigative Journalism",
    "Editorial",
    "Profile Writing",
    "Sports Writing",
    "Political Journalism",
    # Academic work
    "Dissertation",
    "Thesis",
    "Critical Analysis",
    "Comparative Study",
    "Literature Review",
    "Meta-Analysis",
    "Case Study",
]


@st.cache_resource
def load_classifier():
    """Build the zero-shot classification pipeline once and reuse it.

    Streamlit re-executes the whole script on every interaction;
    ``st.cache_resource`` keeps a single pipeline instance so the model is
    not re-created on each rerun.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# Page header and file uploader
st.title("BOOK GENRE PREDICTION APP")
st.write("Upload a book(pdf format), and this app will predict the genres in the book.")
pdf_file = st.file_uploader("Upload PDF", type=["pdf"])

if pdf_file is not None:
    st.write("Processing the PDF...")
    text = extract_text_from_pdf(pdf_file)
    # Strip first so leading whitespace does not eat into the character budget.
    sample = text.strip()[:MAX_TEXT_CHARS]
    if sample:
        st.write("PDF Text Extracted. Predicting the Genres...")
        classifier = load_classifier()
        # Perform zero-shot classification; multi_label=True scores every
        # label independently instead of normalising across labels.
        result = classifier(sample, CANDIDATE_LABELS, multi_label=True)
        # Explicit sort so the display order never depends on the order the
        # pipeline happens to return.
        genres = sorted(
            zip(result["labels"], result["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        st.subheader(f"Top {TOP_GENRE_COUNT} Detected Genres:")
        for genre, score in genres[:TOP_GENRE_COUNT]:
            st.write(f"**{genre.capitalize()}**: {score:.2f}")
    else:
        st.error("No text could be extracted from the PDF. Please try another file.")