Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -110,29 +110,23 @@ def seperate_image_text_from_pdf(pdf_url):
|
|
| 110 |
|
| 111 |
def pdf_image_text_embedding_and_text_embedding(pages_info):
|
| 112 |
try:
|
| 113 |
-
# List to store page embeddings
|
| 114 |
page_embeddings = []
|
| 115 |
|
| 116 |
# Iterate through each page
|
| 117 |
for page in pages_info:
|
| 118 |
# Extract text from the page
|
| 119 |
-
text = page
|
|
|
|
| 120 |
|
| 121 |
-
# Extract images from the page
|
| 122 |
-
images = page["images"]
|
| 123 |
-
|
| 124 |
-
# List to store image embeddings
|
| 125 |
image_embeddings = []
|
| 126 |
-
|
| 127 |
-
# Iterate through each image
|
| 128 |
for image in images:
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
|
| 137 |
# Store the page embeddings in a dictionary
|
| 138 |
page_embedding = {
|
|
@@ -140,12 +134,11 @@ def pdf_image_text_embedding_and_text_embedding(pages_info):
|
|
| 140 |
"text": text,
|
| 141 |
}
|
| 142 |
|
| 143 |
-
# Append the page embedding to the list
|
| 144 |
page_embeddings.append(page_embedding)
|
| 145 |
|
| 146 |
return page_embeddings
|
| 147 |
except Exception as e:
|
| 148 |
-
print("An error occurred:", e)
|
| 149 |
return "Error"
|
| 150 |
|
| 151 |
|
|
|
|
| 110 |
|
| 111 |
def pdf_image_text_embedding_and_text_embedding(pages_info):
|
| 112 |
try:
|
|
|
|
| 113 |
page_embeddings = []
|
| 114 |
|
| 115 |
# Iterate through each page
|
| 116 |
for page in pages_info:
|
| 117 |
# Extract text from the page
|
| 118 |
+
text = page.get("text", "")
|
| 119 |
+
images = page.get("images", [])
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
image_embeddings = []
|
|
|
|
|
|
|
| 122 |
for image in images:
|
| 123 |
+
try:
|
| 124 |
+
image_embedding = get_image_embedding(image)
|
| 125 |
+
extracted_text = extract_text(image)
|
| 126 |
+
image_embeddings.append({"image_embedding": image_embedding.tolist(), "extracted_text": extracted_text})
|
| 127 |
+
except Exception as image_error:
|
| 128 |
+
print(f"Error processing image: {image_error}")
|
| 129 |
+
# Log the error or handle it as needed
|
| 130 |
|
| 131 |
# Store the page embeddings in a dictionary
|
| 132 |
page_embedding = {
|
|
|
|
| 134 |
"text": text,
|
| 135 |
}
|
| 136 |
|
|
|
|
| 137 |
page_embeddings.append(page_embedding)
|
| 138 |
|
| 139 |
return page_embeddings
|
| 140 |
except Exception as e:
|
| 141 |
+
print(f"An error occurred: {e}")
|
| 142 |
return "Error"
|
| 143 |
|
| 144 |
|