Marthee committed on
Commit
adf7f0e
·
verified ·
1 Parent(s): b53a59b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -2
app.py CHANGED
@@ -1780,7 +1780,7 @@ def testFunction(pdf_path, model,LLM_prompt):
1780
  highlighted=[]
1781
  processed_subjects = set() # Initialize at the top of testFunction
1782
  toc_pages = get_toc_page_numbers(doc)
1783
- identified_headers=process_document_in_chunks(len(doc), pdf_path,LLM_prompt, model)
1784
  # identified_headers = identify_headers_with_openrouterNEWW(doc, api_key='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')# ['text', fontsize, page number,y]
1785
 
1786
  # with open("identified_headers.txt", "w", encoding="utf-8") as f:
@@ -2232,6 +2232,19 @@ def testFunction(pdf_path, model,LLM_prompt):
2232
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2233
 
2234
  print("Current working directory:", os.getcwd())
 
 
 
 
 
 
 
 
 
 
 
 
 
2235
  if data_list_JSON and not data_list_JSON[-1]["BodyText"] and collected_lines:
2236
  data_list_JSON[-1]["BodyText"] = collected_lines[1:] if len(collected_lines) > 0 else []
2237
  # Final cleanup of the JSON data before returning
@@ -2244,7 +2257,8 @@ def testFunction(pdf_path, model,LLM_prompt):
2244
 
2245
  # If they match or the subject is inside the first line, remove it
2246
  if subject in first_line or first_line in subject:
2247
- entry["BodyText"] = entry["BodyText"][1:]
 
2248
  jsons.append(data_list_JSON)
2249
  logger.info(f"Markups done! Uploading to dropbox")
2250
  logger.info(f"Uploaded and Readyy!")
 
1780
  highlighted=[]
1781
  processed_subjects = set() # Initialize at the top of testFunction
1782
  toc_pages = get_toc_page_numbers(doc)
1783
+ identified_headers=process_document_in_chunks(len(doc), pdf_path, LLM_prompt, model)
1784
  # identified_headers = identify_headers_with_openrouterNEWW(doc, api_key='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')# ['text', fontsize, page number,y]
1785
 
1786
  # with open("identified_headers.txt", "w", encoding="utf-8") as f:
 
2232
  highlight_boxes(docHighlights, page_highlights,stringtowrite)
2233
 
2234
  print("Current working directory:", os.getcwd())
2235
+
2236
+ docHighlights.save("highlighted_output.pdf")
2237
+
2238
+ # dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
2239
+ # metadata = dbxTeam.sharing_get_shared_link_metadata(pdf_path)
2240
+ # dbPath = '/TSA JOBS/ADR Test/FIND/'
2241
+ # pdf_bytes = BytesIO()
2242
+ # docHighlights.save(pdf_bytes)
2243
+ # pdflink = tsadropboxretrieval.uploadanyFile(doc=docHighlights, path=dbPath, pdfname=filename)
2244
+ # json_output=changepdflinks(json_output,pdflink)
2245
+ # return pdf_bytes.getvalue(), docHighlights , json_output , Alltexttobebilled , alltextWithoutNotbilled , filename
2246
+ # Final safety check: if the very last entry in our list has an empty BodyText,
2247
+ # but we have collected_lines, sync them.
2248
  if data_list_JSON and not data_list_JSON[-1]["BodyText"] and collected_lines:
2249
  data_list_JSON[-1]["BodyText"] = collected_lines[1:] if len(collected_lines) > 0 else []
2250
  # Final cleanup of the JSON data before returning
 
2257
 
2258
  # If they match or the subject is inside the first line, remove it
2259
  if subject in first_line or first_line in subject:
2260
+ entry["BodyText"] = entry["BodyText"][1:]
2261
+
2262
  jsons.append(data_list_JSON)
2263
  logger.info(f"Markups done! Uploading to dropbox")
2264
  logger.info(f"Uploaded and Readyy!")