Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1780,7 +1780,7 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 1780 |
highlighted=[]
|
| 1781 |
processed_subjects = set() # Initialize at the top of testFunction
|
| 1782 |
toc_pages = get_toc_page_numbers(doc)
|
| 1783 |
-
identified_headers=process_document_in_chunks(len(doc), pdf_path,LLM_prompt, model)
|
| 1784 |
# identified_headers = identify_headers_with_openrouterNEWW(doc, api_key='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')# ['text', fontsize, page number,y]
|
| 1785 |
|
| 1786 |
# with open("identified_headers.txt", "w", encoding="utf-8") as f:
|
|
@@ -2232,6 +2232,19 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 2232 |
highlight_boxes(docHighlights, page_highlights,stringtowrite)
|
| 2233 |
|
| 2234 |
print("Current working directory:", os.getcwd())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2235 |
if data_list_JSON and not data_list_JSON[-1]["BodyText"] and collected_lines:
|
| 2236 |
data_list_JSON[-1]["BodyText"] = collected_lines[1:] if len(collected_lines) > 0 else []
|
| 2237 |
# Final cleanup of the JSON data before returning
|
|
@@ -2244,7 +2257,8 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 2244 |
|
| 2245 |
# If they match or the subject is inside the first line, remove it
|
| 2246 |
if subject in first_line or first_line in subject:
|
| 2247 |
-
entry["BodyText"] = entry["BodyText"][1:]
|
|
|
|
| 2248 |
jsons.append(data_list_JSON)
|
| 2249 |
logger.info(f"Markups done! Uploading to dropbox")
|
| 2250 |
logger.info(f"Uploaded and Readyy!")
|
|
|
|
| 1780 |
highlighted=[]
|
| 1781 |
processed_subjects = set() # Initialize at the top of testFunction
|
| 1782 |
toc_pages = get_toc_page_numbers(doc)
|
| 1783 |
+
identified_headers=process_document_in_chunks(len(doc), pdf_path, LLM_prompt, model)
|
| 1784 |
# identified_headers = identify_headers_with_openrouterNEWW(doc, api_key='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')# ['text', fontsize, page number,y]
|
| 1785 |
|
| 1786 |
# with open("identified_headers.txt", "w", encoding="utf-8") as f:
|
|
|
|
| 2232 |
highlight_boxes(docHighlights, page_highlights,stringtowrite)
|
| 2233 |
|
| 2234 |
print("Current working directory:", os.getcwd())
|
| 2235 |
+
|
| 2236 |
+
docHighlights.save("highlighted_output.pdf")
|
| 2237 |
+
|
| 2238 |
+
# dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
|
| 2239 |
+
# metadata = dbxTeam.sharing_get_shared_link_metadata(pdf_path)
|
| 2240 |
+
# dbPath = '/TSA JOBS/ADR Test/FIND/'
|
| 2241 |
+
# pdf_bytes = BytesIO()
|
| 2242 |
+
# docHighlights.save(pdf_bytes)
|
| 2243 |
+
# pdflink = tsadropboxretrieval.uploadanyFile(doc=docHighlights, path=dbPath, pdfname=filename)
|
| 2244 |
+
# json_output=changepdflinks(json_output,pdflink)
|
| 2245 |
+
# return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled , alltextWithoutNotbilled , filename
|
| 2246 |
+
# Final safety check: if the very last entry in our list has an empty BodyText,
|
| 2247 |
+
# but we have collected_lines, sync them.
|
| 2248 |
if data_list_JSON and not data_list_JSON[-1]["BodyText"] and collected_lines:
|
| 2249 |
data_list_JSON[-1]["BodyText"] = collected_lines[1:] if len(collected_lines) > 0 else []
|
| 2250 |
# Final cleanup of the JSON data before returning
|
|
|
|
| 2257 |
|
| 2258 |
# If they match or the subject is inside the first line, remove it
|
| 2259 |
if subject in first_line or first_line in subject:
|
| 2260 |
+
entry["BodyText"] = entry["BodyText"][1:]
|
| 2261 |
+
|
| 2262 |
jsons.append(data_list_JSON)
|
| 2263 |
logger.info(f"Markups done! Uploading to dropbox")
|
| 2264 |
logger.info(f"Uploaded and Readyy!")
|