Spaces:
Runtime error
Runtime error
Commit ·
737df3f
1
Parent(s): 59122b6
Upload folder using huggingface_hub
Browse files
- pdfparser.py +0 -33
pdfparser.py
CHANGED
|
@@ -1,16 +1,12 @@
|
|
| 1 |
-
import io
|
| 2 |
import os
|
| 3 |
|
| 4 |
-
import boto3
|
| 5 |
from langchain.document_loaders import PyPDFium2Loader
|
| 6 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
from langchain.vectorstores import FAISS
|
| 9 |
-
from pdf2image import convert_from_path
|
| 10 |
from sllim import chat
|
| 11 |
|
| 12 |
# Standard Textract client setup
|
| 13 |
-
textract_client = boto3.client("textract")
|
| 14 |
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
|
| 15 |
DOCUMENTS:
|
| 16 |
{docs}
|
|
@@ -21,29 +17,6 @@ QUERY:
|
|
| 21 |
embeddings = OpenAIEmbeddings()
|
| 22 |
|
| 23 |
|
| 24 |
-
def convert_pdf_to_text(pdf_file_path: str):
|
| 25 |
-
# Convert the PDF to an in-memory image format
|
| 26 |
-
images = convert_from_path(pdf_file_path)
|
| 27 |
-
|
| 28 |
-
docs = []
|
| 29 |
-
for image in images:
|
| 30 |
-
# Convert the image into byte stream
|
| 31 |
-
with io.BytesIO() as image_stream:
|
| 32 |
-
image.save(image_stream, "JPEG")
|
| 33 |
-
image_bytes = image_stream.getvalue()
|
| 34 |
-
|
| 35 |
-
# Use Textract to detect text in the local image
|
| 36 |
-
response = textract_client.detect_document_text(Document={"Bytes": image_bytes})
|
| 37 |
-
|
| 38 |
-
text = ""
|
| 39 |
-
# Print the detected text blocks
|
| 40 |
-
for item in response["Blocks"]:
|
| 41 |
-
if item["BlockType"] == "LINE":
|
| 42 |
-
text += item["Text"] + "\n"
|
| 43 |
-
docs.append(text)
|
| 44 |
-
return docs
|
| 45 |
-
|
| 46 |
-
|
| 47 |
def process_file(file_path):
|
| 48 |
index_path = get_index_name(file_path)
|
| 49 |
if os.path.exists(index_path):
|
|
@@ -59,9 +32,6 @@ def process_file(file_path):
|
|
| 59 |
length_function=len,
|
| 60 |
)
|
| 61 |
docs = text_splitter.split_documents(data)
|
| 62 |
-
if len(docs) == 0:
|
| 63 |
-
data = convert_pdf_to_text(file_path)
|
| 64 |
-
docs = text_splitter.create_documents(data)
|
| 65 |
|
| 66 |
# Embed paragraphs
|
| 67 |
db = FAISS.from_documents(docs, embeddings)
|
|
@@ -118,9 +88,6 @@ def ask_question(query, upload_file, history=None):
|
|
| 118 |
length_function=len,
|
| 119 |
)
|
| 120 |
docs = text_splitter.split_documents(data)
|
| 121 |
-
if len(docs) == 0:
|
| 122 |
-
data = convert_pdf_to_text(file_path)
|
| 123 |
-
docs = text_splitter.create_documents(data)
|
| 124 |
|
| 125 |
# Embed paragraphs
|
| 126 |
db = FAISS.from_documents(docs, embeddings)
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
| 3 |
from langchain.document_loaders import PyPDFium2Loader
|
| 4 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 6 |
from langchain.vectorstores import FAISS
|
|
|
|
| 7 |
from sllim import chat
|
| 8 |
|
| 9 |
# Standard Textract client setup
|
|
|
|
| 10 |
template = """I will give you a couple of paragraphs from a PDF document along with a question about the document. You will provide an answer as accurately as possible and provide citations for why that answer is correct.
|
| 11 |
DOCUMENTS:
|
| 12 |
{docs}
|
|
|
|
| 17 |
embeddings = OpenAIEmbeddings()
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def process_file(file_path):
|
| 21 |
index_path = get_index_name(file_path)
|
| 22 |
if os.path.exists(index_path):
|
|
|
|
| 32 |
length_function=len,
|
| 33 |
)
|
| 34 |
docs = text_splitter.split_documents(data)
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Embed paragraphs
|
| 37 |
db = FAISS.from_documents(docs, embeddings)
|
|
|
|
| 88 |
length_function=len,
|
| 89 |
)
|
| 90 |
docs = text_splitter.split_documents(data)
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Embed paragraphs
|
| 93 |
db = FAISS.from_documents(docs, embeddings)
|