Spaces:
Sleeping
Sleeping
File size: 2,993 Bytes
a42faad e972bc5 a42faad 8aedc3d cd2380a 5bd8bbc 16e1ff8 8aedc3d a42faad 474aba9 d70b0c6 a1835af 7f359fe a42faad 2afcbac 07b93af 00ff12b 07b93af 00ff12b d70b0c6 a1835af fdb40a4 2d55565 fdb40a4 e9af6cc fdb40a4 c3fcdee 7c913a6 fdb40a4 a42faad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline
# The extracted text is plain data rather than a shared resource (model,
# connection, ...), so it is cached with st.cache_data: re-running the script
# with the same uploaded file reuses the previous extraction.
@st.cache_data
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source; the app
            passes the object returned by ``st.file_uploader``.

    Returns:
        str: The page texts joined with newlines, in page order. The result
        is empty or whitespace-only when no page yields any text.
    """
    reader = PdfReader(pdf_file)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next. ``or ""`` guards against a page whose extraction
    # yields nothing instead of a string.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
# ---------------------------------------------------------------------------
# Streamlit page: upload a PDF, classify its text against a fixed genre list
# with a zero-shot model, and show the highest-scoring genres.
# ---------------------------------------------------------------------------

# Only this many leading characters of the extracted text are classified.
MAX_CHARS = 3000
# Number of highest-scoring genres shown to the user.
TOP_K = 20

# Candidate genres offered to the zero-shot classifier.
CANDIDATE_LABELS = [
    # Technical and scientific writing
    "Scientific Papers", "Technical Documentation", "Research Reports",
    "Academic Journals", "White Papers", "Technical Manuals", "Patents",
    "Software Documentation", "Engineering Specifications",
    "Computer Science Literature", "Machine Learning Publications",
    "Data Science Reports", "Network Architecture Descriptions",
    "Cybersecurity Analysis", "Algorithm Descriptions",
    # Fiction
    "Fantasy", "Science Fiction", "Mystery", "Thriller", "Romance",
    "Historical Fiction", "Horror", "Adventure", "Crime", "Western",
    "Dystopian", "Magical Realism", "Young Adult", "Children's Literature",
    "Gothic",
    # Non-fiction
    "Biography", "Autobiography", "Memoir", "Travel Writing", "History",
    "Philosophy", "Psychology", "Self-Help", "Political Commentary",
    "True Crime", "Nature Writing", "Cultural Studies", "Sociology",
    "Anthropology", "Religious Studies",
    # Literary forms
    "Poetry", "Drama", "Epic", "Short Story", "Novel", "Novella", "Satire",
    "Tragedy", "Comedy", "Tragicomedy",
    # Journalism
    "News Reporting", "Feature Writing", "Opinion Pieces",
    "Investigative Journalism", "Editorial", "Profile Writing",
    "Sports Writing", "Political Journalism",
    # Academic work
    "Dissertation", "Thesis", "Critical Analysis", "Comparative Study",
    "Literature Review", "Meta-Analysis", "Case Study",
]


@st.cache_resource
def load_classifier():
    """Build the zero-shot classification pipeline once and reuse it.

    Streamlit re-executes this script on every interaction; caching the
    pipeline as a resource avoids reloading the model for each upload.

    Returns:
        The ``transformers`` zero-shot-classification pipeline backed by
        ``facebook/bart-large-mnli``.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


st.title("BOOK GENRE PREDICTION APP")
st.write("Upload a book(pdf format), and this app will predict the genres in the book.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type=["pdf"])

if pdf_file is not None:
    st.write("Processing the PDF...")
    text = extract_text_from_pdf(pdf_file)

    if text.strip():
        st.write("PDF Text Extracted. Predicting the Genres...")
        classifier = load_classifier()

        # multi_label=True scores every genre independently, so a book can
        # rank highly in several genres at once.
        result = classifier(text[:MAX_CHARS], CANDIDATE_LABELS, multi_label=True)

        # Pair each label with its score and order best-first.
        genres = sorted(
            zip(result["labels"], result["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )

        st.subheader(f"Top {TOP_K} Detected Genres:")
        for genre, score in genres[:TOP_K]:
            # Labels are shown verbatim: str.capitalize() would lowercase
            # every word after the first ("Science Fiction" -> "Science fiction").
            st.write(f"**{genre}**: {score:.2f}")
    else:
        st.error("No text could be extracted from the PDF. Please try another file.")
|