Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -801,7 +801,7 @@ def openPDF(pdf_path):
|
|
| 801 |
# return out
|
| 802 |
|
| 803 |
|
| 804 |
-
def identify_headers_with_openrouterNEWW(
|
| 805 |
"""Ask an LLM (OpenRouter) to identify headers in the document.
|
| 806 |
Returns a list of dicts: {text, page, suggested_level, confidence}.
|
| 807 |
The function sends plain page-line strings to the LLM (including page numbers)
|
|
@@ -813,7 +813,7 @@ def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=N
|
|
| 813 |
logger.info(f"Model: {model}")
|
| 814 |
# logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
|
| 815 |
|
| 816 |
-
|
| 817 |
api_key = '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'
|
| 818 |
if api_key is None:
|
| 819 |
api_key = os.getenv("OPENROUTER_API_KEY") or None
|
|
@@ -1105,7 +1105,7 @@ def identify_headers_with_openrouterNEWW(doc, model,LLM_prompt, pages_to_check=N
|
|
| 1105 |
# # Return None or a custom error message to Gradio
|
| 1106 |
# return None
|
| 1107 |
|
| 1108 |
-
def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model):
|
| 1109 |
logger.debug(f"Starting function")
|
| 1110 |
# keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
|
| 1111 |
filenames=[]
|
|
@@ -1152,7 +1152,7 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model)
|
|
| 1152 |
# doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
|
| 1153 |
# )
|
| 1154 |
logger.info(f"Starting model run.")
|
| 1155 |
-
identified_headers = identify_headers_with_openrouterNEWW(doc, model)
|
| 1156 |
allheaders_LLM=[]
|
| 1157 |
for h in identified_headers:
|
| 1158 |
if int(h["page"]) in toc_pages:
|
|
@@ -1612,10 +1612,10 @@ def build_subject_body_map(jsons):
|
|
| 1612 |
|
| 1613 |
return subject_body
|
| 1614 |
|
| 1615 |
-
def identify_headers_and_save_excel(pdf_path, model):
|
| 1616 |
try:
|
| 1617 |
-
|
| 1618 |
-
jsons
|
| 1619 |
print(jsons)
|
| 1620 |
if not result:
|
| 1621 |
df = pd.DataFrame([{
|
|
|
|
| 801 |
# return out
|
| 802 |
|
| 803 |
|
| 804 |
+
def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
|
| 805 |
"""Ask an LLM (OpenRouter) to identify headers in the document.
|
| 806 |
Returns a list of dicts: {text, page, suggested_level, confidence}.
|
| 807 |
The function sends plain page-line strings to the LLM (including page numbers)
|
|
|
|
| 813 |
logger.info(f"Model: {model}")
|
| 814 |
# logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
|
| 815 |
|
| 816 |
+
doc = openPDF(pdf_path)
|
| 817 |
api_key = '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'
|
| 818 |
if api_key is None:
|
| 819 |
api_key = os.getenv("OPENROUTER_API_KEY") or None
|
|
|
|
| 1105 |
# # Return None or a custom error message to Gradio
|
| 1106 |
# return None
|
| 1107 |
|
| 1108 |
+
def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
|
| 1109 |
logger.debug(f"Starting function")
|
| 1110 |
# keywordstoSkip=["installation", "execution", "miscellaneous items", "workmanship", "testing", "labeling"]
|
| 1111 |
filenames=[]
|
|
|
|
| 1152 |
# doc, toc_pages, most_common_font_size, most_common_color, most_common_font, top_margin, bottom_margin
|
| 1153 |
# )
|
| 1154 |
logger.info(f"Starting model run.")
|
| 1155 |
+
# identified_headers = identify_headers_with_openrouterNEWW(doc, model)
|
| 1156 |
allheaders_LLM=[]
|
| 1157 |
for h in identified_headers:
|
| 1158 |
if int(h["page"]) in toc_pages:
|
|
|
|
| 1612 |
|
| 1613 |
return subject_body
|
| 1614 |
|
| 1615 |
+
def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
| 1616 |
try:
|
| 1617 |
+
result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
|
| 1618 |
+
jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
|
| 1619 |
print(jsons)
|
| 1620 |
if not result:
|
| 1621 |
df = pd.DataFrame([{
|