Spaces:

Fluospark128
/

Genre_Prediction_App

Sleeping

File size: 2,993 Bytes

a42faad
 
 
 
 
 
e972bc5
 
a42faad
8aedc3d
cd2380a
5bd8bbc
16e1ff8
8aedc3d
 
a42faad
 
474aba9
d70b0c6
 
a1835af
7f359fe
a42faad
2afcbac
07b93af
00ff12b
07b93af
00ff12b
d70b0c6
 
 
a1835af
fdb40a4
2d55565
fdb40a4
 
e9af6cc
fdb40a4
c3fcdee
7c913a6
 
fdb40a4
 
 
 
 
a42faad

import streamlit as st
from PyPDF2 import PdfReader
from transformers import pipeline

# Function to extract text from PDF
#
# NOTE: this function is intentionally NOT decorated with
# ``@st.cache_resource``. A decorator left behind above a commented-out
# classifier loader used to attach itself to this function (comments and
# blank lines do not terminate a decorator), which stored the text of every
# uploaded PDF in Streamlit's global resource cache with no expiry or size
# limit configured.
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source; this
            app passes the object returned by ``st.file_uploader``.

    Returns:
        str: The text of all pages joined in page order with no separator.
        An empty string when no page yields any text.
    """
    reader = PdfReader(pdf_file)
    # ``or ""`` guards against a page whose extraction yields nothing
    # (presumably image-only/scanned pages on some PyPDF2 versions), so a
    # single text-less page cannot abort the whole document with a TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)

# --- Classification settings -------------------------------------------------

# Hugging Face model used for zero-shot classification.
MODEL_NAME = "facebook/bart-large-mnli"

# Only the first MAX_CHARS characters of the extracted text are classified.
MAX_CHARS = 3000

# Number of highest-scoring genres shown to the user.
TOP_K = 20

# Candidate genres offered to the zero-shot classifier.
CANDIDATE_LABELS = [
    # Technical / scientific
    "Scientific Papers", "Technical Documentation", "Research Reports",
    "Academic Journals", "White Papers", "Technical Manuals", "Patents",
    "Software Documentation", "Engineering Specifications",
    "Computer Science Literature", "Machine Learning Publications",
    "Data Science Reports", "Network Architecture Descriptions",
    "Cybersecurity Analysis", "Algorithm Descriptions",
    # Fiction
    "Fantasy", "Science Fiction", "Mystery", "Thriller", "Romance",
    "Historical Fiction", "Horror", "Adventure", "Crime", "Western",
    "Dystopian", "Magical Realism", "Young Adult", "Children's Literature",
    "Gothic",
    # Non-fiction
    "Biography", "Autobiography", "Memoir", "Travel Writing", "History",
    "Philosophy", "Psychology", "Self-Help", "Political Commentary",
    "True Crime", "Nature Writing", "Cultural Studies", "Sociology",
    "Anthropology", "Religious Studies",
    # Literary forms
    "Poetry", "Drama", "Epic", "Short Story", "Novel", "Novella", "Satire",
    "Tragedy", "Comedy", "Tragicomedy",
    # Journalism
    "News Reporting", "Feature Writing", "Opinion Pieces",
    "Investigative Journalism", "Editorial", "Profile Writing",
    "Sports Writing", "Political Journalism",
    # Academic writing
    "Dissertation", "Thesis", "Critical Analysis", "Comparative Study",
    "Literature Review", "Meta-Analysis",
    # Was " Case Study": the stray leading space leaked into the text sent to
    # the model and broke the bold markup when the label was rendered.
    "Case Study",
]


@st.cache_resource
def load_classifier():
    """Build the zero-shot classification pipeline once and reuse it.

    Streamlit re-executes this script on every interaction; without caching
    the model would be re-instantiated for every uploaded file.

    Returns:
        The ``transformers`` zero-shot-classification pipeline for MODEL_NAME.
    """
    return pipeline("zero-shot-classification", model=MODEL_NAME)


# --- Page ---------------------------------------------------------------------

st.title("BOOK GENRE PREDICTION APP")
st.write("Upload a book(pdf format), and this app will predict the genres in the book.")

# File uploader
pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
if pdf_file is not None:
    st.write("Processing the PDF...")
    text = extract_text_from_pdf(pdf_file)
    if text.strip():
        st.write("PDF Text Extracted. Predicting the Genres...")
        classifier = load_classifier()
        # Perform zero-shot classification; multi_label=True scores every
        # genre independently, so a book can match several genres at once.
        result = classifier(text[:MAX_CHARS], CANDIDATE_LABELS, multi_label=True)
        # Pair each label with its score and rank from most to least likely.
        genres = sorted(
            zip(result["labels"], result["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        st.subheader(f"Top {TOP_K} Detected Genres:")
        for genre, score in genres[:TOP_K]:
            st.write(f"**{genre.capitalize()}**: {score:.2f}")
    else:
        st.error("No text could be extracted from the PDF. Please try another file.")