Spaces:

kaiserpister
/

demo-pdfchat

Runtime error

App Files Community

kaiserpister committed on Sep 15, 2023

Commit

737df3f

1 Parent(s): 59122b6

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

pdfparser.py +0 -33

pdfparser.py CHANGED Viewed

@@ -1,16 +1,12 @@
-import io
 import os
-import boto3
 from langchain.document_loaders import PyPDFium2Loader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
-from pdf2image import convert_from_path
 from sllim import chat
 # Standard Textract client setup
-textract_client = boto3.client("textract")
 template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
 DOCUMENTS:
 {docs}
@@ -21,29 +17,6 @@ QUERY:
 embeddings = OpenAIEmbeddings()
-def convert_pdf_to_text(pdf_file_path: str):
-    # Convert the PDF to an in-memory image format
-    images = convert_from_path(pdf_file_path)
-    docs = []
-    for image in images:
-        # Convert the image into byte stream
-        with io.BytesIO() as image_stream:
-            image.save(image_stream, "JPEG")
-            image_bytes = image_stream.getvalue()
-        # Use Textract to detect text in the local image
-        response = textract_client.detect_document_text(Document={"Bytes": image_bytes})
-        text = ""
-        # Print the detected text blocks
-        for item in response["Blocks"]:
-            if item["BlockType"] == "LINE":
-                text += item["Text"] + "\n"
-        docs.append(text)
-    return docs
 def process_file(file_path):
     index_path = get_index_name(file_path)
     if os.path.exists(index_path):
@@ -59,9 +32,6 @@ def process_file(file_path):
         length_function=len,
     )
     docs = text_splitter.split_documents(data)
-    if len(docs) == 0:
-        data = convert_pdf_to_text(file_path)
-        docs = text_splitter.create_documents(data)
     # Embed paragraphs
     db = FAISS.from_documents(docs, embeddings)
@@ -118,9 +88,6 @@ def ask_question(query, upload_file, history=None):
             length_function=len,
         )
         docs = text_splitter.split_documents(data)
-        if len(docs) == 0:
-            data = convert_pdf_to_text(file_path)
-            docs = text_splitter.create_documents(data)
         # Embed paragraphs
         db = FAISS.from_documents(docs, embeddings)

 import os
 from langchain.document_loaders import PyPDFium2Loader
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
 from sllim import chat
 # Standard Textract client setup
 template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
 DOCUMENTS:
 {docs}
 embeddings = OpenAIEmbeddings()
 def process_file(file_path):
     index_path = get_index_name(file_path)
     if os.path.exists(index_path):
         length_function=len,
     )
     docs = text_splitter.split_documents(data)
     # Embed paragraphs
     db = FAISS.from_documents(docs, embeddings)
             length_function=len,
         )
         docs = text_splitter.split_documents(data)
         # Embed paragraphs
         db = FAISS.from_documents(docs, embeddings)