Spaces:

fajjos
/

pdf_model

Sleeping

App Files Files Community

fajjos committed on Dec 9, 2024

Commit

72c8ec0

verified ·

1 Parent(s): be4b0cf

Create app.py

Browse files

Files changed (1) hide show

app.py +78 -0

app.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import os
+import streamlit as st
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from PyPDF2 import PdfReader
+import torch
+from typing import List
# Load the model and tokenizer from Hugging Face
model_name = "fajjos/pdf_model"  # Replace with your model name


@st.cache_resource
def _load_model_and_tokenizer(name: str):
    """Load the seq2seq model and its tokenizer, cached across script reruns.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading the model weights on each click.

    Args:
        name: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    # Use a GPU when one is present; otherwise stay on the CPU so the app
    # still starts on CPU-only hardware.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded_model.to(device)
    loaded_model.eval()
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)
# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_file: str) -> str:
    """Extract the text of every page of a single PDF using PyPDF2.

    Args:
        pdf_file: Path of the PDF, handed directly to ``PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_file)
    # Defensive: a page that yields no text (falsy result) contributes an
    # empty string instead of breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to search for a keyword in the extracted PDF texts
def search_keyword_in_pdfs(
    keyword: str, pdf_texts: dict, max_new_tokens: int = 20000
) -> List[str]:
    """Ask the model, per PDF, whether the keyword appears in its text.

    For each entry a prompt containing the keyword and the PDF text is sent
    to the module-level ``model``; the PDF counts as a match when the decoded
    response contains the keyword (case-insensitive).

    Args:
        keyword: The keyword to look for.
        pdf_texts: Mapping of PDF name to its extracted text.
        max_new_tokens: Upper bound on generated tokens per PDF.

    Returns:
        Names of the PDFs whose model response contains the keyword, in the
        iteration order of ``pdf_texts``.
    """
    found_pdfs: List[str] = []
    if not pdf_texts:
        # Nothing to search; avoid touching the model at all.
        return found_pdfs

    keyword_lower = keyword.lower()
    # Run on whatever device the model actually lives on instead of assuming
    # CUDA, so the inputs never end up on a different device than the weights.
    device = model.device

    for pdf_name, pdf_text in pdf_texts.items():
        prompt = f"Does the keyword '{keyword}' appear in the following text? If yes, provide details.\n\n{pdf_text}"
        # NOTE(review): the prompt is not truncated; very long PDFs may exceed
        # the model's maximum input length -- confirm against the model used.
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=max_new_tokens,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # If keyword is found in the response
        if keyword_lower in response.lower():
            found_pdfs.append(pdf_name)
    return found_pdfs
# Function to process all PDFs in a specified folder
def process_pdfs_in_folder(folder_path: str) -> dict:
    """Extract text from every PDF file directly inside ``folder_path``.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    read; sub-folders are not searched.

    Args:
        folder_path: Directory to scan.

    Returns:
        Mapping of file name to the text extracted from that file, filled in
        sorted file-name order.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    pdf_texts = {}
    # os.listdir gives entries in arbitrary order; sort for stable results.
    for file_name in sorted(os.listdir(folder_path)):
        # Match ".PDF" / ".Pdf" as well as ".pdf".
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory can be named "something.pdf"; only read real files.
        if not os.path.isfile(file_path):
            continue
        pdf_texts[file_name] = extract_text_from_pdf(file_path)
    return pdf_texts
# Streamlit UI for folder path and keyword input
st.title("PDF Keyword Search")
folder_path = st.text_input("Enter the folder path containing PDFs:").strip()
# Strip the keyword like the folder path so a whitespace-only entry is
# treated as missing rather than being searched for.
keyword = st.text_input("Enter the keyword to search for:").strip()
if st.button("Search"):
    if not folder_path or not keyword:
        st.error("Please provide both the folder path and the keyword.")
    else:
        # Top-level UI boundary: any failure (bad path, unreadable PDF, model
        # error) is reported to the user instead of crashing the app.
        try:
            # Process all PDFs in the folder
            pdf_texts = process_pdfs_in_folder(folder_path)
            # Perform keyword search in the extracted texts
            found_pdfs = search_keyword_in_pdfs(keyword, pdf_texts)
            # Display results
            if found_pdfs:
                st.write(f"The keyword '{keyword}' was found in the following PDF files:")
                for pdf in found_pdfs:
                    st.write(f"- {pdf}")
            else:
                st.write(f"The keyword '{keyword}' was not found in any PDFs in the folder '{folder_path}'.")
        except Exception as e:
            st.error(f"Error: {e}")