Spaces:

Shankarm08
/

pdfreader

Sleeping

App Files Files Community

Shankarm08 committed on Oct 5, 2024

Commit

c5608b5

verified ·

1 Parent(s): e3d3e2d

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -44

app.py CHANGED Viewed

@@ -1,25 +1,26 @@
-import gradio as gr
 import torch
 from transformers import BertTokenizer, BertModel
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
 import pdfplumber
-app = FastAPI()
-class TextClassificationRequest(BaseModel):
-    text: str
-@app.post("/classify")
-async def classify_text(request: TextClassificationRequest):
-    # Load the pre-trained BERT model and tokenizer
-    model_name = "bert-base-uncased"
-    tokenizer = BertTokenizer.from_pretrained(model_name)
-    model = BertModel.from_pretrained(model_name)
     # Preprocess the input text
     inputs = tokenizer.encode_plus(
-        request.text,
         add_special_tokens=True,
         max_length=512,
         return_attention_mask=True,
@@ -32,38 +33,22 @@ async def classify_text(request: TextClassificationRequest):
     # Extract the features
     features = outputs.last_hidden_state[:, 0, :]
-    # Return the features as a list
-    return {"features": features.tolist()}
-# Define a function to extract text from a PDF
-def extract_text_from_pdf(pdf_file):
-    with pdfplumber.open(pdf_file) as pdf:
-        text = ""
-        for page in pdf.pages:
-            text += page.extract_text()
-    return text
-# Create a Gradio interface for handling PDF input
-def classify_pdf(pdf_file):
     # Extract text from the uploaded PDF
     extracted_text = extract_text_from_pdf(pdf_file)
-    # Create the request for FastAPI
-    request = TextClassificationRequest(text=extracted_text)
-    # Simulate calling the FastAPI endpoint
-    output = classify_text(request)
-    return output
-# Create a Gradio interface
-interface = gr.Interface(
-    fn=classify_pdf,
-    inputs="file",  # Expecting PDF file input
-    outputs="json",  # Outputs a JSON dictionary
-    title="PDF Text Classification",
-    description="Upload a PDF file to classify its text using BERT"
-)
-# Launch the Gradio interface
-interface.launch(server_port=7861)

+import streamlit as st
 import torch
 from transformers import BertTokenizer, BertModel
 import pdfplumber
+# Load the pre-trained BERT model and tokenizer outside the function for efficiency
+model_name = "bert-base-uncased"
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertModel.from_pretrained(model_name)
+# Define a function to extract text from a PDF
+def extract_text_from_pdf(pdf_file):
+    """Return the text of every page in a PDF, concatenated in page order.
+
+    Args:
+        pdf_file: Whatever ``pdfplumber.open`` accepts; this app passes the
+            object returned by ``st.file_uploader``.
+
+    Returns:
+        A single string holding the text of all pages, with no separator
+        inserted between pages. Empty if no page yields any text.
+    """
+    with pdfplumber.open(pdf_file) as pdf:
+        # A page without a text layer (e.g. a scanned image) may make
+        # extract_text() return None in some pdfplumber versions; falling
+        # back to "" keeps such pages from raising TypeError on concatenation.
+        return "".join(page.extract_text() or "" for page in pdf.pages)
+# Define a function to classify the extracted text
+def classify_text(text):
     # Preprocess the input text
     inputs = tokenizer.encode_plus(
+        text,
         add_special_tokens=True,
         max_length=512,
         return_attention_mask=True,
     # Extract the features
     features = outputs.last_hidden_state[:, 0, :]
+    return features.tolist()
+# Streamlit app setup
+st.title("PDF Text Classification")
+st.write("Upload a PDF file to classify its text using BERT")
+# File uploader for PDFs
+pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
+if pdf_file is not None:
     # Extract text from the uploaded PDF
     extracted_text = extract_text_from_pdf(pdf_file)
+    st.write("Extracted Text:")
+    st.write(extracted_text)
+    # Classify the extracted text
+    if st.button("Classify"):
+        features = classify_text(extracted_text)
+        st.json({"features": features})  # Display the features in JSON format