Spaces:

findConsole
/

PromptTesting

Sleeping

App Files Community

Marthee committed 17 days ago

Commit

4dbb8be

verified ·

1 Parent(s): 8c4ca9e

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -7

app.py CHANGED Viewed

@@ -801,7 +801,7 @@ def openPDF(pdf_path):
 #     return out
-def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
     """Ask an LLM (OpenRouter) to identify headers in the document.
     Returns a list of dicts: {text, page, suggested_level, confidence}.
     The function sends plain page-line strings to the LLM (including page numbers)
@@ -813,7 +813,7 @@ def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=N
     logger.info(f"Model: {model}")
     # logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
-    # doc = openPDF(pdf_path)
     api_key = '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'
     if api_key is None:
         api_key = os.getenv("OPENROUTER_API_KEY") or None
@@ -1105,7 +1105,7 @@ def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=N
 #         # Return None or a custom error message to Gradio
 #         return None
-def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model):
     logger.debug(f"Starting function")
     # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
     filenames=[]
@@ -1152,7 +1152,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model)
         #     doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
         # )
         logger.info(f"Starting model run.")
-        identified_headers = identify_headers_with_openrouterNEWW(doc, model)
         allheaders_LLM=[]
         for h in identified_headers:
             if int(h["page"]) in toc_pages:
@@ -1612,10 +1612,10 @@ def build_subject_body_map(jsons):
     return subject_body
-def identify_headers_and_save_excel(pdf_path, model):
     try:
-        # result = identify_headers_with_openrouterNEWW(pdf_path, model)
-        jsons,result = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model)
         print(jsons)
         if not result:
             df = pd.DataFrame([{

 #     return out
+def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
     """Ask an LLM (OpenRouter) to identify headers in the document.
     Returns a list of dicts: {text, page, suggested_level, confidence}.
     The function sends plain page-line strings to the LLM (including page numbers)
     logger.info(f"Model: {model}")
     # logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
+    doc = openPDF(pdf_path)
     api_key = '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'
     if api_key is None:
         api_key = os.getenv("OPENROUTER_API_KEY") or None
 #         # Return None or a custom error message to Gradio
 #         return None
+def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
     logger.debug(f"Starting function")
     # keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
     filenames=[]
         #     doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
         # )
         logger.info(f"Starting model run.")
+        # identified_headers = identify_headers_with_openrouterNEWW(doc, model)
         allheaders_LLM=[]
         for h in identified_headers:
             if int(h["page"]) in toc_pages:
     return subject_body
+def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
     try:
+        result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
+        jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
         print(jsons)
         if not result:
             df = pd.DataFrame([{