Marthee committed on
Commit
4dbb8be
·
verified ·
1 Parent(s): 8c4ca9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -801,7 +801,7 @@ def openPDF(pdf_path):
801
  # return out
802
 
803
 
804
- def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
805
  """Ask an LLM (OpenRouter) to identify headers in the document.
806
  Returns a list of dicts: {text, page, suggested_level, confidence}.
807
  The function sends plain page-line strings to the LLM (including page numbers)
@@ -813,7 +813,7 @@ def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=N
813
  logger.info(f"Model: {model}")
814
  # logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
815
 
816
- # doc = openPDF(pdf_path)
817
  api_key = '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'
818
  if api_key is None:
819
  api_key = os.getenv("OPENROUTER_API_KEY") or None
@@ -1105,7 +1105,7 @@ def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=N
1105
  # # Return None or a custom error message to Gradio
1106
  # return None
1107
 
1108
- def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model):
1109
  logger.debug(f"Starting function")
1110
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1111
  filenames=[]
@@ -1152,7 +1152,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model)
1152
  # doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1153
  # )
1154
  logger.info(f"Starting model run.")
1155
- identified_headers = identify_headers_with_openrouterNEWW(doc, model)
1156
  allheaders_LLM=[]
1157
  for h in identified_headers:
1158
  if int(h["page"]) in toc_pages:
@@ -1612,10 +1612,10 @@ def build_subject_body_map(jsons):
1612
 
1613
  return subject_body
1614
 
1615
- def identify_headers_and_save_excel(pdf_path, model):
1616
  try:
1617
- # result = identify_headers_with_openrouterNEWW(pdf_path, model)
1618
- jsons,result = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model)
1619
  print(jsons)
1620
  if not result:
1621
  df = pd.DataFrame([{
 
801
  # return out
802
 
803
 
804
+ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
805
  """Ask an LLM (OpenRouter) to identify headers in the document.
806
  Returns a list of dicts: {text, page, suggested_level, confidence}.
807
  The function sends plain page-line strings to the LLM (including page numbers)
 
813
  logger.info(f"Model: {model}")
814
  # logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
815
 
816
+ doc = openPDF(pdf_path)
817
  api_key = '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'
818
  if api_key is None:
819
  api_key = os.getenv("OPENROUTER_API_KEY") or None
 
1105
  # # Return None or a custom error message to Gradio
1106
  # return None
1107
 
1108
+ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
1109
  logger.debug(f"Starting function")
1110
  # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
1111
  filenames=[]
 
1152
  # doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
1153
  # )
1154
  logger.info(f"Starting model run.")
1155
+ # identified_headers = identify_headers_with_openrouterNEWW(doc, model)
1156
  allheaders_LLM=[]
1157
  for h in identified_headers:
1158
  if int(h["page"]) in toc_pages:
 
1612
 
1613
  return subject_body
1614
 
1615
+ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
1616
  try:
1617
+ result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
1618
+ jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
1619
  print(jsons)
1620
  if not result:
1621
  df = pd.DataFrame([{