Spaces:

EN-collab
/

HQ_Project_EN

Sleeping

App Files Files Community

1mpreccable committed on Apr 1, 2025

Commit

44be36b

1 Parent(s): e44b3be

added pdf tools

Browse files

Files changed (6) hide show

pages/Project_4_-_NLP_and_PDF_analyser.py +74 -4
requirements.txt +5 -1
src/__pycache__/functions_langchain.cpython-311.pyc +0 -0
src/__pycache__/functions_pdf.cpython-311.pyc +0 -0
src/__pycache__/functions_scrapper.cpython-311.pyc +0 -0
src/functions_pdf.py +69 -0

pages/Project_4_-_NLP_and_PDF_analyser.py CHANGED Viewed

@@ -1,9 +1,79 @@
 import streamlit as st
 ################################################################################
-st.sidebar.title("App parameters")
-st.write("This is the NLP and PDF analyser page. It is still under construction.")
-st.write("Please come back later.")

 import streamlit as st
+import os
+from src.functions_pdf import pymupdf_pdf_to_text, pypdf2_pdf_to_text, pdfminer_pdf_to_text, pdfplumber_pdf_to_text
 ################################################################################
+# Sidebar for parameters
+st.sidebar.title("App Parameters")
+# Select method in the sidebar
+method = st.sidebar.selectbox(
+    "Select the method to extract text from PDF",
+    ("PyMuPDF", "PyPDF2", "pdfminer", "pdfplumber", "reportlab"),
+)
+# Main page title and description
+st.title("NLP and PDF Analyser")
+st.markdown(
+    """
+    This tool allows you to extract text from PDF files using different methods.
+    Select a method, upload a PDF file, and extract the text.
+    """
+)
+st.divider()
+# File uploader
+file = st.file_uploader("Upload a PDF file", type=["pdf"])
+if file is not None:
+    # Display file details
+    st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
+    # Extract text button
+    if st.button("Extract Text"):
+        # Save the uploaded file to a temporary location
+        with open("temp_uploaded_file.pdf", "wb") as temp_file:
+            temp_file.write(file.read())
+        # Use the temporary file path for processing
+        temp_file_path = "temp_uploaded_file.pdf"
+        # Extract text based on the selected method
+        st.subheader("Extracted Text")
+        if method == "PyMuPDF":
+            st.write("Using **PyMuPDF** for text extraction.")
+            text = pymupdf_pdf_to_text(temp_file_path)
+        elif method == "PyPDF2":
+            st.write("Using **PyPDF2** for text extraction.")
+            text = pypdf2_pdf_to_text(temp_file_path)
+        elif method == "pdfminer":
+            st.write("Using **pdfminer** for text extraction.")
+            text = pdfminer_pdf_to_text(temp_file_path)
+        elif method == "pdfplumber":
+            st.write("Using **pdfplumber** for text extraction.")
+            text = pdfplumber_pdf_to_text(temp_file_path)
+        ################################################################################
+        # Clean up the temporary file
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+        ################################################################################
+        else:
+            st.error("Invalid method selected.")
+            text = ""
+        # Display extracted text
+        if text:
+            st.text_area("Extracted Text", text, height=300)
+            # Download button for extracted text
+            st.download_button(
+                label="Download Extracted Text",
+                data=text,
+                file_name="extracted_text.txt",
+                mime="text/plain",
+            )
+        else:
+            st.warning("No text extracted. Please check the PDF file or method.")
+else:
+    st.warning("Please upload a PDF file to proceed.")

requirements.txt CHANGED Viewed

@@ -14,4 +14,8 @@ langchain-core
 langgraph>0.2.27
 sentry-sdk
 langchain-mongodb
-langchain-huggingface

 langgraph>0.2.27
 sentry-sdk
 langchain-mongodb
+langchain-huggingface
+PyMuPDF
+PyPDF2
+pdfminer.six
+pdfplumber

src/__pycache__/functions_langchain.cpython-311.pyc CHANGED Viewed

Binary files a/src/__pycache__/functions_langchain.cpython-311.pyc and b/src/__pycache__/functions_langchain.cpython-311.pyc differ

src/__pycache__/functions_pdf.cpython-311.pyc ADDED Viewed

Binary file (2.49 kB). View file

src/__pycache__/functions_scrapper.cpython-311.pyc CHANGED Viewed

Binary files a/src/__pycache__/functions_scrapper.cpython-311.pyc and b/src/__pycache__/functions_scrapper.cpython-311.pyc differ

src/functions_pdf.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import pymupdf
+from PyPDF2 import PdfReader
+from pdfminer.high_level import extract_text
+from langchain.document_loaders import PDFPlumberLoader
+def pymupdf_pdf_to_text(file_path):
+    """
+    Extract text from a PDF file using PyMuPDF.
+    Args:
+        file_path (str): Path to the PDF file.
+    Returns:
+        str: Extracted text from the PDF file.
+    """
+    doc = pymupdf.open(stream=file_path.read(), filetype="pdf")
+    text = ""
+    for page in doc:
+        text += page.get_text()  + "\n"
+    return text
+def pypdf2_pdf_to_text(file_path):
+    """
+    Extract text from a PDF file using PyPDF2.
+    Args:
+        file_path (str): Path to the PDF file.
+    Returns:
+        str: Extracted text from the PDF file.
+    """
+    reader = PdfReader(file_path)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() + "\n"
+    return text
+def pdfminer_pdf_to_text(file_path):
+    """
+    Extract text from a PDF file using pdfminer.
+    Args:
+        file_path (str): Path to the PDF file.
+    Returns:
+        str: Extracted text from the PDF file.
+    """
+    # Implementation for pdfminer extraction goes here
+    text = extract_text(file_path)
+    return text
+def pdfplumber_pdf_to_text(file_path):
+    """
+    Extract text from a PDF file using pdfplumber.
+    Args:
+        file_path (str): Path to the PDF file.
+    Returns:
+        str: Extracted text from the PDF file.
+    """
+    loader = PDFPlumberLoader(file_path)
+    documents = loader.load()
+    text = ""
+    for doc in documents:
+        text += doc.page_content + "\n"
+    return text