Spaces:

findConsole
/

PromptTesting

Sleeping

App Files Files Community

Marthee committed 5 days ago

Commit

7f5d965

verified ·

1 Parent(s): b13a7a5

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -49

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import json
 import requests
 from io import BytesIO
 from datetime import datetime
 import pandas as pd
 from io import BytesIO
 import fitz  # PyMuPDF
@@ -492,6 +493,20 @@ def get_toc_page_numbers(doc, max_pages_to_check=15):
     logger.info("No TOC pages found")
     return [] # Return empty list if nothing found
 def openPDF(pdf_path):
     logger.info(f"Opening PDF from URL: {pdf_path}")
@@ -1838,7 +1853,7 @@ def testFunction(pdf_path, model,LLM_prompt):
         heading_to_search = heading_to_searchDict['text']
         heading_to_searchPageNum = heading_to_searchDict['page']
         paths=heading_to_searchDict['path']
-        xloc=heading_to_searchDict['x']
         yloc=heading_to_searchDict['y']
         # Initialize variables
@@ -2285,14 +2300,14 @@ def testFunction(pdf_path, model,LLM_prompt):
             # If they match or the subject is inside the first line, remove it
             if subject in first_line or first_line in subject:
-                entry["BodyText"] = entry["BodyText"][1:]
-    # jsons.append(data_list_JSON)
     json_output = json.dumps(data_list_JSON, indent=4)
-    logger.info(f"Markups done!")
     logger.info(f"Uploaded and Readyy!")
     return json_output,identified_headers
@@ -2310,12 +2325,77 @@ def build_subject_body_map(jsons):
     return subject_body
 def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
     try:
-        # result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
-        print('beginnging identify')
         jsons,result = testFunction(pdf_path, model,LLM_prompt)
-        print('done , will start dataframe',jsons,result)
         if not result:
             df = pd.DataFrame([{
                 "text": None,
@@ -2326,54 +2406,62 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
                 "System Message": "No headers were identified by the LLM."
             }])
         else:
             df = pd.DataFrame(result)
             subject_body_map = {}
-            # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
-            for pdf_level in jsons:
-                if not isinstance(pdf_level, list):
-                    continue
-                for section_level in pdf_level:
-                    # If the LLM returns a list of dictionaries here
-                    if isinstance(section_level, list):
-                        for obj in section_level:
-                            if isinstance(obj, dict):
-                                subject = obj.get("Subject")
-                                body = obj.get("BodyText", [])
-                                if subject:
-                                    # Ensure body is a list before joining
-                                    body_str = " ".join(body) if isinstance(body, list) else str(body)
-                                    subject_body_map[subject.strip()] = body_str
-                    # If the LLM returns a single dictionary here
-                    elif isinstance(section_level, dict):
-                        subject = section_level.get("Subject")
-                        body = section_level.get("BodyText", [])
-                        if subject:
-                            body_str = " ".join(body) if isinstance(body, list) else str(body)
-                            subject_body_map[subject.strip()] = body_str
-            # Map the extracted body text to the "text" column in your main DataFrame
-            if "text" in df.columns:
-                df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
-            else:
-                df["body"] = None
-        # Save to Excel
-        output_path = os.path.abspath("header_analysis_output.xlsx")
-        df.to_excel(output_path, index=False, engine="openpyxl")
-        print("--- Processed DataFrame ---")
-        print(df)
-        return output_path
     except Exception as e:
-        print(f"ERROR - Critical error in processing: {e}")
-        # Re-raise or handle as needed
         return None
 # Improved launch with debug mode enabled
 iface = gr.Interface(
     fn=identify_headers_and_save_excel,

 import requests
 from io import BytesIO
 from datetime import datetime
+from difflib import SequenceMatcher
 import pandas as pd
 from io import BytesIO
 import fitz  # PyMuPDF
     logger.info("No TOC pages found")
     return [] # Return empty list if nothing found
def is_header(span, most_common_font_size, most_common_color, most_common_font,allheadersLLM):
    """Return True when a text span is styled differently from the body text.

    A span is treated as a header candidate when its font size is larger than
    ``most_common_font_size`` or when its font name differs, ignoring case,
    from ``most_common_font``.

    Args:
        span: Mapping describing one text span. ``"size"`` is required;
            ``"font"`` is optional and treated as an empty name when absent.
        most_common_font_size: Font size to compare the span's size against.
        most_common_color: Unused; kept so existing callers keep working.
        most_common_font: Font name to compare the span's font against.
        allheadersLLM: Unused; kept so existing callers keep working.

    Returns:
        bool: True if the span is larger than, or set in a different font
        from, the reference values.

    Raises:
        KeyError: If ``span`` has no ``"size"`` entry.
    """
    # Read the font name once, tolerating spans that carry no "font" key
    # instead of failing later on a direct index.
    fontname = span.get("font", "").lower()

    # NOTE(review): membership of the span text in ``allheadersLLM`` used to be
    # AND-ed with the size test and then OR-ed with that very same size test,
    # so it could never change the outcome. If an LLM match is meant to count
    # on its own, that has to be introduced as a deliberate behavior change.
    return (
        span["size"] > most_common_font_size
        or fontname != most_common_font.lower()
    )
 def openPDF(pdf_path):
     logger.info(f"Opening PDF from URL: {pdf_path}")
         heading_to_search = heading_to_searchDict['text']
         heading_to_searchPageNum = heading_to_searchDict['page']
         paths=heading_to_searchDict['path']
+        # xloc=heading_to_searchDict['x']
         yloc=heading_to_searchDict['y']
         # Initialize variables
             # If they match or the subject is inside the first line, remove it
             if subject in first_line or first_line in subject:
+                entry["BodyText"] = entry["BodyText"][1:]
+    print('data_list_JSON',data_list_JSON)
+    # json_output.append(data_list_JSON)
     json_output = json.dumps(data_list_JSON, indent=4)
+    logger.info(f"Markups done! Uploading to dropbox")
     logger.info(f"Uploaded and Readyy!")
     return json_output,identified_headers
     return subject_body
+# def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
+#     try:
+#         # result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
+#         print('beginnging identify')
+#         jsons,result = testFunction(pdf_path, model,LLM_prompt)
+#         print('done , will start dataframe',jsons,result)
+#         if not result:
+#             df = pd.DataFrame([{
+#                 "text": None,
+#                 "page": None,
+#                 "suggested_level": None,
+#                 "confidence": None,
+#                 "body": None,
+#                 "System Message": "No headers were identified by the LLM."
+#             }])
+#         else:
+#             df = pd.DataFrame(result)
+#             subject_body_map = {}
+#             # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
+#             for pdf_level in jsons:
+#                 if not isinstance(pdf_level, list):
+#                     continue
+#                 for section_level in pdf_level:
+#                     # If the LLM returns a list of dictionaries here
+#                     if isinstance(section_level, list):
+#                         for obj in section_level:
+#                             if isinstance(obj, dict):
+#                                 subject = obj.get("Subject")
+#                                 body = obj.get("BodyText", [])
+#                                 if subject:
+#                                     # Ensure body is a list before joining
+#                                     body_str = " ".join(body) if isinstance(body, list) else str(body)
+#                                     subject_body_map[subject.strip()] = body_str
+#                     # If the LLM returns a single dictionary here
+#                     elif isinstance(section_level, dict):
+#                         subject = section_level.get("Subject")
+#                         body = section_level.get("BodyText", [])
+#                         if subject:
+#                             body_str = " ".join(body) if isinstance(body, list) else str(body)
+#                             subject_body_map[subject.strip()] = body_str
+#             # Map the extracted body text to the "text" column in your main DataFrame
+#             if "text" in df.columns:
+#                 df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
+#             else:
+#                 df["body"] = None
+#         # Save to Excel
+#         output_path = os.path.abspath("header_analysis_output.xlsx")
+#         df.to_excel(output_path, index=False, engine="openpyxl")
+#         print("--- Processed DataFrame ---")
+#         print(df)
+#         return output_path
+#     except Exception as e:
+#         print(f"ERROR - Critical error in processing: {e}")
+#         # Re-raise or handle as needed
+#         return None
 def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
     try:
+        # result = identify_headers_with_openrouterNEWW(pdf_path, model)
         jsons,result = testFunction(pdf_path, model,LLM_prompt)
+        print('jsonssss',jsons)
         if not result:
             df = pd.DataFrame([{
                 "text": None,
                 "System Message": "No headers were identified by the LLM."
             }])
         else:
+            print('here')
             df = pd.DataFrame(result)
+            # subject_body_map = {}
+            # for pdf_sections in jsons:
+            #     for obj in pdf_sections:
+            #         subject = obj.get("Subject")
+            #         body = obj.get("BodyText", [])
+            #         if subject:
+            #             subject_body_map[subject.strip()] = " ".join(body)
+            # df["body"] = df["text"].map(subject_body_map)
             subject_body_map = {}
+            def process_obj(obj):
+                if not isinstance(obj, dict):
+                    return
+                subject = obj.get("Subject")
+                body = obj.get("BodyText", [])
+                if subject:
+                    if isinstance(body, list):
+                        body_text = " ".join(body)
+                    else:
+                        body_text = str(body)
+                    subject_body_map[subject.strip()] = body_text
+            for item in jsons:
+                # Case: flat list of dicts (like your example)
+                if isinstance(item, dict):
+                    process_obj(item)
+                # Case: nested list of dicts
+                elif isinstance(item, list):
+                    for obj in item:
+                        process_obj(obj)
+                    output_path = os.path.abspath("header_analysis_output.xlsx")
+                    df.to_excel(output_path, index=False, engine="openpyxl")
+                    print("--- Processed DataFrame ---")
+                    print(df)
+                    return output_path
     except Exception as e:
+        logger.error(f"Critical error in processing: {str(e)}")
         return None
 # Improved launch with debug mode enabled
 iface = gr.Interface(
     fn=identify_headers_and_save_excel,