Spaces:

Fluospark128
/

Genre_Prediction_App

Sleeping

App Files Files Community

Genre_Prediction_App / app.py

Fluospark128

Update app.py

e9af6cc verified 12 months ago

raw

history blame contribute delete

2.99 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from transformers import pipeline

	# Function to extract text from PDF
	# Cached with st.cache_data because the result is a plain string derived
	# from the upload. The st.cache_resource decorator that used to sit above
	# this function was left over from a commented-out model loader and only
	# applied here by accident.
	@st.cache_data
	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of a PDF as a single string.

	    Args:
	        pdf_file: Whatever ``PdfReader`` accepts as its source; the app
	            passes the object returned by ``st.file_uploader``.

	    Returns:
	        The extracted page texts joined by newlines. A page that yields
	        no text contributes an empty string, so a PDF without any
	        extractable text produces a whitespace-only result.
	    """
	    reader = PdfReader(pdf_file)
	    # Join with a newline so the last word of one page is not fused with
	    # the first word of the next; ``or ""`` keeps the join safe should a
	    # page report no text at all.
	    return "\n".join(page.extract_text() or "" for page in reader.pages)

	# Model and display settings for the app.
	MODEL_NAME = "facebook/bart-large-mnli"
	MAX_INPUT_CHARS = 3000  # only this many leading characters are classified
	TOP_N = 20  # number of highest-scoring genres shown to the user

	# Candidate genres offered to the zero-shot classifier.
	CANDIDATE_LABELS = [
	    # Technical and scientific documents
	    "Scientific Papers", "Technical Documentation", "Research Reports",
	    "Academic Journals", "White Papers", "Technical Manuals", "Patents",
	    "Software Documentation", "Engineering Specifications",
	    "Computer Science Literature", "Machine Learning Publications",
	    "Data Science Reports", "Network Architecture Descriptions",
	    "Cybersecurity Analysis", "Algorithm Descriptions",
	    # Fiction
	    "Fantasy", "Science Fiction", "Mystery", "Thriller", "Romance",
	    "Historical Fiction", "Horror", "Adventure", "Crime", "Western",
	    "Dystopian", "Magical Realism", "Young Adult", "Children's Literature",
	    "Gothic",
	    # Non-fiction
	    "Biography", "Autobiography", "Memoir", "Travel Writing", "History",
	    "Philosophy", "Psychology", "Self-Help", "Political Commentary",
	    "True Crime", "Nature Writing", "Cultural Studies", "Sociology",
	    "Anthropology", "Religious Studies",
	    # Literary forms
	    "Poetry", "Drama", "Epic", "Short Story", "Novel", "Novella", "Satire",
	    "Tragedy", "Comedy", "Tragicomedy",
	    # Journalism
	    "News Reporting", "Feature Writing", "Opinion Pieces",
	    "Investigative Journalism", "Editorial", "Profile Writing",
	    "Sports Writing", "Political Journalism",
	    # Academic writing
	    "Dissertation", "Thesis", "Critical Analysis", "Comparative Study",
	    "Literature Review", "Meta-Analysis", "Case Study",
	]


	@st.cache_resource
	def load_classifier():
	    """Build the zero-shot classification pipeline once and reuse it.

	    Without the cache the model would be constructed again every time the
	    script runs with an uploaded file.
	    """
	    return pipeline("zero-shot-classification", model=MODEL_NAME)


	# Page header and file uploader
	st.title("BOOK GENRE PREDICTION APP")
	st.write("Upload a book(pdf format), and this app will predict the genres in the book.")
	pdf_file = st.file_uploader("Upload PDF", type=["pdf"])

	if pdf_file is not None:
	    st.write("Processing the PDF...")
	    text = extract_text_from_pdf(pdf_file)
	    if text.strip():
	        st.write("PDF Text Extracted. Predicting the Genres...")
	        classifier = load_classifier()
	        # Perform zero-shot classification on the leading slice of the text;
	        # multi_label=True is passed so every label gets its own score.
	        result = classifier(
	            text[:MAX_INPUT_CHARS], CANDIDATE_LABELS, multi_label=True
	        )
	        # Rank (label, score) pairs from most to least likely.
	        genres = sorted(
	            zip(result["labels"], result["scores"]),
	            key=lambda pair: pair[1],
	            reverse=True,
	        )
	        st.subheader(f"Top {TOP_N} Detected Genres:")
	        for genre, score in genres[:TOP_N]:
	            # Labels are shown exactly as defined; str.capitalize() would
	            # lowercase every word after the first ("Science fiction").
	            st.write(f"{genre}: {score:.2f}")
	    else:
	        st.error("No text could be extracted from the PDF. Please try another file.")