rawanessam committed on
Commit
c2dc4a5
·
verified ·
1 Parent(s): cd2c25f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -394
app.py CHANGED
@@ -484,325 +484,6 @@ def openPDF(pdf_path):
484
  logger.info(f"PDF opened successfully, {len(doc)} pages")
485
  return doc
486
 
487
- # def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
488
- # """Ask an LLM (OpenRouter) to identify headers in the document.
489
- # Returns a list of dicts: {text, page, suggested_level, confidence}.
490
- # The function sends plain page-line strings to the LLM (including page numbers)
491
- # and asks for a JSON array containing only header lines with suggested levels.
492
- # """
493
- # logger.info("=" * 80)
494
- # logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
495
- # logger.info(f"PDF Path: {pdf_path}")
496
- # logger.info(f"Model: {model}")
497
- # logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
498
-
499
- # doc = openPDF(pdf_path)
500
- # api_key = 'sk-or-v1-***REDACTED***'  # SECURITY: a real OpenRouter API key was hard-coded here; it is exposed in git history and this diff — revoke the key and load it from the OPENROUTER_API_KEY environment variable instead
501
- # if api_key is None:
502
- # api_key = os.getenv("OPENROUTER_API_KEY") or None
503
- # model = str(model)
504
- # # toc_pages = get_toc_page_numbers(doc)
505
- # lines_for_prompt = []
506
- # pgestoRun=20
507
- # # logger.info(f"TOC pages to skip: {toc_pages}")
508
- # logger.info(f"Total pages in document: {pgestoRun}")
509
-
510
- # # Collect text lines from pages (skip TOC pages)
511
- # total_lines = 0
512
- # for pno in range(len(doc)):
513
- # # if pages_to_check and pno not in pages_to_check:
514
- # # continue
515
- # # if pno in toc_pages:
516
- # # logger.debug(f"Skipping TOC page {pno}")
517
- # # continue
518
- # page = doc.load_page(pno)
519
- # page_height = page.rect.height
520
-
521
- # text_dict = page.get_text("dict")
522
- # lines_for_prompt = []
523
- # lines_on_page = 0
524
-
525
- # for block in text_dict.get("blocks", []):
526
- # if block.get("type") != 0: # text blocks only
527
- # continue
528
-
529
- # for line in block.get("lines", []):
530
- # spans = line.get("spans", [])
531
- # if not spans:
532
- # continue
533
-
534
- # # Use first span to check vertical position
535
- # y0 = spans[0]["bbox"][1]
536
- # y1 = spans[0]['bbox'][3]
537
- # # if y0 < top_margin or y1 > (page_height - bottom_margin):
538
- # # continue
539
- # text = " ".join(s.get('text','') for s in spans).strip()
540
- # if text:
541
-
542
-
543
- # # prefix with page for easier mapping back
544
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
545
- # lines_on_page += 1
546
-
547
- # # if lines_on_page > 0:
548
-
549
- # # page = doc.load_page(pno)
550
- # # page_height = page.rect.height
551
- # # lines_on_page = 0
552
- # # text_dict = page.get_text("dict")
553
- # # lines = []
554
- # # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
555
- # # for block in page.get_text("dict").get('blocks', []):
556
- # # if block.get('type') != 0:
557
- # # continue
558
- # # for line in block.get('lines', []):
559
- # # spans = line.get('spans', [])
560
- # # if not spans:
561
- # # continue
562
- # # y0 = spans[0]['bbox'][1]
563
- # # y1 = spans[0]['bbox'][3]
564
- # # if y0 < top_margin or y1 > (page_height - bottom_margin):
565
- # # continue
566
- # # for s in spans:
567
- # # # text,font,size,flags,color
568
- # # # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
569
-
570
- # # # prefix with page for easier mapping back
571
- # # text = s["text"].strip()
572
- # # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
573
-
574
- # # # if not lines_for_prompt:
575
- # # # return []
576
-
577
- # # if text:
578
- # # # prefix with page for easier mapping back
579
- # # # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
580
- # # lines_on_page += 1
581
-
582
-
583
- # if lines_on_page > 0:
584
- # logger.debug(f"Page {pno}: collected {lines_on_page} lines")
585
- # total_lines += lines_on_page
586
-
587
- # logger.info(f"Total lines collected for LLM: {total_lines}")
588
-
589
- # if not lines_for_prompt:
590
- # logger.warning("No lines collected for prompt")
591
- # return []
592
-
593
- # # Log sample of lines
594
- # logger.info("Sample lines (first 10):")
595
- # for i, line in enumerate(lines_for_prompt[:10]):
596
- # logger.info(f" {i}: {line}")
597
-
598
- # prompt = LLM_prompt+"\n\nLines:\n" + "\n".join(lines_for_prompt)
599
-
600
-
601
- # logger.debug(f"Full prompt length: {len(prompt)} characters")
602
- # # Changed: Print entire prompt, not truncated
603
- # print("=" * 80)
604
- # print("FULL LLM PROMPT:")
605
- # print(prompt)
606
- # print("=" * 80)
607
-
608
- # # Also log to file
609
- # # try:
610
- # # with open("full_prompt.txt", "w", encoding="utf-8") as f:
611
- # # f.write(prompt)
612
- # # logger.info("Full prompt saved to full_prompt.txt")
613
- # # except Exception as e:
614
- # # logger.error(f"Could not save prompt to file: {e}")
615
-
616
- # if not api_key:
617
- # # No API key: return empty so caller can fallback to heuristics
618
- # logger.error("No API key provided")
619
- # return []
620
-
621
- # url = "https://openrouter.ai/api/v1/chat/completions"
622
-
623
- # # Build headers following the OpenRouter example
624
- # headers = {
625
- # "Authorization": f"Bearer {api_key}",
626
- # "Content-Type": "application/json",
627
- # "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
628
- # "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
629
- # }
630
-
631
- # # Log request details (without exposing full API key)
632
- # logger.info(f"Making request to OpenRouter with model: {model}")
633
- # logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
634
-
635
- # # Wrap the prompt as the example 'content' array expected by OpenRouter
636
- # body = {
637
- # "model": model,
638
- # "messages": [
639
- # {
640
- # "role": "user",
641
- # "content": [
642
- # {"type": "text", "text": prompt}
643
- # ]
644
- # }
645
- # ]
646
- # }
647
-
648
- # # Debug: log request body (truncated) and write raw response for inspection
649
- # try:
650
- # # Changed: Log full body (excluding prompt text which is already logged)
651
- # logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
652
-
653
- # # Removed timeout parameter
654
- # resp = requests.post(
655
- # url=url,
656
- # headers=headers,
657
- # data=json.dumps(body)
658
- # )
659
-
660
- # logger.info(f"HTTP Response Status: {resp.status_code}")
661
- # resp.raise_for_status()
662
-
663
- # resp_text = resp.text
664
- # # Changed: Print entire response
665
- # print("=" * 80)
666
- # print("FULL LLM RESPONSE:")
667
- # print(resp_text)
668
- # print("=" * 80)
669
-
670
- # logger.info(f"LLM raw response length: {len(resp_text)}")
671
-
672
- # # Save raw response for offline inspection
673
- # try:
674
- # with open("llm_debug.json", "w", encoding="utf-8") as fh:
675
- # fh.write(resp_text)
676
- # logger.info("Raw response saved to llm_debug.json")
677
- # except Exception as e:
678
- # logger.error(f"Warning: could not write llm_debug.json: {e}")
679
-
680
- # rj = resp.json()
681
- # logger.info(f"LLM parsed response type: {type(rj)}")
682
- # if isinstance(rj, dict):
683
- # logger.debug(f"Response keys: {list(rj.keys())}")
684
-
685
- # except requests.exceptions.RequestException as e:
686
- # logger.error(f"HTTP request failed: {repr(e)}")
687
- # return []
688
- # except Exception as e:
689
- # logger.error(f"LLM call failed: {repr(e)}")
690
- # return []
691
-
692
- # # Extract textual reply robustly
693
- # text_reply = None
694
- # if isinstance(rj, dict):
695
- # choices = rj.get('choices') or []
696
- # logger.debug(f"Number of choices in response: {len(choices)}")
697
-
698
- # if choices:
699
- # for i, c in enumerate(choices):
700
- # logger.debug(f"Choice {i}: {c}")
701
-
702
- # c0 = choices[0]
703
- # msg = c0.get('message') or c0.get('delta') or {}
704
- # content = msg.get('content')
705
-
706
- # if isinstance(content, list):
707
- # logger.debug(f"Content is a list with {len(content)} items")
708
- # for idx, c in enumerate(content):
709
- # if c.get('type') == 'text' and c.get('text'):
710
- # text_reply = c.get('text')
711
- # logger.debug(f"Found text reply in content[{idx}], length: {len(text_reply)}")
712
- # break
713
- # elif isinstance(content, str):
714
- # text_reply = content
715
- # logger.debug(f"Content is string, length: {len(text_reply)}")
716
- # elif isinstance(msg, dict) and msg.get('content') and isinstance(msg.get('content'), dict):
717
- # text_reply = msg.get('content').get('text')
718
- # logger.debug(f"Found text in nested content dict")
719
-
720
- # # Fallback extraction
721
- # if not text_reply:
722
- # logger.debug("Trying fallback extraction from choices")
723
- # for c in rj.get('choices', []):
724
- # if isinstance(c.get('text'), str):
725
- # text_reply = c.get('text')
726
- # logger.debug(f"Found text reply in choice.text, length: {len(text_reply)}")
727
- # break
728
-
729
- # if not text_reply:
730
- # logger.error("Could not extract text reply from response")
731
- # # Changed: Print the entire response structure for debugging
732
- # print("=" * 80)
733
- # print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
734
- # print(json.dumps(rj, indent=2))
735
- # print("=" * 80)
736
- # return []
737
-
738
- # # Changed: Print the extracted text reply
739
- # print("=" * 80)
740
- # print("EXTRACTED TEXT REPLY:")
741
- # print(text_reply)
742
- # print("=" * 80)
743
-
744
- # logger.info(f"Extracted text reply length: {len(text_reply)}")
745
- # logger.debug(f"First 500 chars of reply: {text_reply[:500]}...")
746
-
747
- # s = text_reply.strip()
748
- # start = s.find('[')
749
- # end = s.rfind(']')
750
- # js = s[start:end+1] if start != -1 and end != -1 else s
751
-
752
- # logger.debug(f"Looking for JSON array: start={start}, end={end}")
753
- # logger.debug(f"Extracted JSON string (first 500 chars): {js[:500]}...")
754
-
755
- # try:
756
- # parsed = json.loads(js)
757
- # logger.info(f"Successfully parsed JSON, got {len(parsed)} items")
758
- # except json.JSONDecodeError as e:
759
- # logger.error(f"Failed to parse JSON: {e}")
760
- # logger.error(f"JSON string that failed to parse: {js[:1000]}")
761
- # # Try to find any JSON-like structure
762
- # try:
763
- # # Try to extract any JSON array
764
- # import re
765
- # json_pattern = r'\[\s*\{.*?\}\s*\]'
766
- # matches = re.findall(json_pattern, text_reply, re.DOTALL)
767
- # if matches:
768
- # logger.info(f"Found {len(matches)} potential JSON arrays via regex")
769
- # for i, match in enumerate(matches):
770
- # try:
771
- # parsed = json.loads(match)
772
- # logger.info(f"Successfully parsed regex match {i} with {len(parsed)} items")
773
- # break
774
- # except json.JSONDecodeError as e2:
775
- # logger.debug(f"Regex match {i} also failed: {e2}")
776
- # continue
777
- # else:
778
- # logger.error("All regex matches failed to parse")
779
- # return []
780
- # else:
781
- # logger.error("No JSON-like pattern found via regex")
782
- # return []
783
- # except Exception as e2:
784
- # logger.error(f"Regex extraction also failed: {e2}")
785
- # return []
786
-
787
- # # Log parsed results
788
- # logger.info(f"Parsed {len(parsed)} header items:")
789
- # for i, obj in enumerate(parsed[:10]): # Log first 10 items
790
- # logger.info(f" Item {i}: {obj}")
791
-
792
- # # Normalize parsed entries and return
793
- # out = []
794
- # for obj in parsed:
795
- # t = obj.get('text')
796
- # page = int(obj.get('page')) if obj.get('page') else None
797
- # level = obj.get('suggested_level')
798
- # conf = float(obj.get('confidence') or 0)
799
- # if t and page is not None:
800
- # out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
801
-
802
- # logger.info(f"Returning {len(out)} valid header entries")
803
- # return out
804
-
805
-
806
 
807
  def process_document_in_chunks(
808
  lengthofDoc,
@@ -874,39 +555,6 @@ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_ch
874
 
875
  for pno in range(start_page, end_page):
876
  page = doc.load_page(pno)
877
- # # Collect text lines from pages (skip TOC pages)
878
- # total_lines = 0
879
- # for pno in range(len(doc)):
880
- # if pages_to_check and pno not in pages_to_check:
881
- # continue
882
- # if pno in toc_pages:
883
- # logger.debug(f"Skipping TOC page {pno}")
884
- # continue
885
-
886
- # page = doc.load_page(pno)
887
- # page_height = page.rect.height
888
- # lines_on_page = 0
889
- # text_dict = page.get_text("dict")
890
- # lines = []
891
- # # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
892
- # for block in text_dict["blocks"]:
893
- # if block["type"] != 0:
894
- # continue
895
- # for line in block["lines"]:
896
- # for span in line["spans"]:
897
- # text = span["text"].strip()
898
- # if not text:
899
- # continue
900
- # if text:
901
- # # prefix with page for easier mapping back
902
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
903
- # lines_on_page += 1
904
-
905
- # if lines_on_page > 0:
906
- # logger.debug(f"Page {pno}: collected {lines_on_page} lines")
907
- # total_lines += lines_on_page
908
-
909
- # logger.info(f"Total lines collected for LLM: {total_lines}")
910
  page_height = page.rect.height
911
  lines_on_page = 0
912
  text_dict = page.get_text("dict")
@@ -1220,32 +868,6 @@ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_ch
1220
  logger.info(f"Returning {len(out)} valid header entries")
1221
  return out
1222
 
1223
- # def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
1224
- # try:
1225
- # # 1. Get the result from your LLM function
1226
- # result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
1227
-
1228
- # # 2. Safety Check: If LLM failed or returned nothing
1229
- # if not result:
1230
- # logger.warning("No headers found or LLM failed. Creating an empty report.")
1231
- # df = pd.DataFrame([{"System Message": "No headers were identified by the LLM."}])
1232
- # else:
1233
- # df = pd.DataFrame(result)
1234
-
1235
- # # 3. Use an Absolute Path for the output
1236
- # # This ensures Gradio knows exactly where the file is
1237
- # output_path = os.path.abspath("header_analysis_output.xlsx")
1238
-
1239
- # # 4. Save using the engine explicitly
1240
- # df.to_excel(output_path, index=False, engine='openpyxl')
1241
-
1242
- # logger.info(f"File successfully saved to {output_path}")
1243
- # return output_path
1244
-
1245
- # except Exception as e:
1246
- # logger.error(f"Critical error in processing: {str(e)}")
1247
- # # Return None or a custom error message to Gradio
1248
- # return None
1249
 
1250
  def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
1251
  logger.debug(f"Starting function")
@@ -1504,9 +1126,6 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1504
  # Construct the final encoded link
1505
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1506
 
1507
- # Correctly construct the final URL with page and zoom
1508
- # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1509
-
1510
  # Get current date and time
1511
  now = datetime.now()
1512
 
@@ -1608,9 +1227,6 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1608
  # Construct the final encoded link
1609
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1610
 
1611
- # Correctly construct the final URL with page and zoom
1612
- # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1613
-
1614
  # Get current date and time
1615
  now = datetime.now()
1616
 
@@ -1975,9 +1591,6 @@ def testFunction(pdf_path, model,LLM_prompt):
1975
  # Construct the final encoded link
1976
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1977
 
1978
- # Correctly construct the final URL with page and zoom
1979
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1980
-
1981
  # Get current date and time
1982
  now = datetime.now()
1983
 
@@ -2080,9 +1693,6 @@ def testFunction(pdf_path, model,LLM_prompt):
2080
  # Construct the final encoded link
2081
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2082
 
2083
- # Correctly construct the final URL with page and zoom
2084
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2085
-
2086
  # Get current date and time
2087
  now = datetime.now()
2088
 
@@ -2160,10 +1770,7 @@ def testFunction(pdf_path, model,LLM_prompt):
2160
  # for header in allheaders_LLM
2161
  # )
2162
 
2163
- # # ✅ FINAL header condition
2164
- # line_is_header = text_matches_header and max_font_size > 11
2165
-
2166
-
2167
  if line_is_header:
2168
  header_font_size = max(span["size"] for span in spans)
2169
  is_probably_real_header = (
 
484
  logger.info(f"PDF opened successfully, {len(doc)} pages")
485
  return doc
486
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  def process_document_in_chunks(
489
  lengthofDoc,
 
555
 
556
  for pno in range(start_page, end_page):
557
  page = doc.load_page(pno)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  page_height = page.rect.height
559
  lines_on_page = 0
560
  text_dict = page.get_text("dict")
 
868
  logger.info(f"Returning {len(out)} valid header entries")
869
  return out
870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
871
 
872
  def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
873
  logger.debug(f"Starting function")
 
1126
  # Construct the final encoded link
1127
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1128
 
 
 
 
1129
  # Get current date and time
1130
  now = datetime.now()
1131
 
 
1227
  # Construct the final encoded link
1228
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1229
 
 
 
 
1230
  # Get current date and time
1231
  now = datetime.now()
1232
 
 
1591
  # Construct the final encoded link
1592
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1593
 
 
 
 
1594
  # Get current date and time
1595
  now = datetime.now()
1596
 
 
1693
  # Construct the final encoded link
1694
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1695
 
 
 
 
1696
  # Get current date and time
1697
  now = datetime.now()
1698
 
 
1770
  # for header in allheaders_LLM
1771
  # )
1772
 
1773
+ # # ✅ FINAL header
 
 
 
1774
  if line_is_header:
1775
  header_font_size = max(span["size"] for span in spans)
1776
  is_probably_real_header = (