Guiyom committed on
Commit
a5f7d67
·
verified ·
1 Parent(s): 7241714

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -55
app.py CHANGED
@@ -451,22 +451,41 @@ def remove_text_from_html(report_html: str, text_to_remove: str) -> str:
451
  child.replace_with(new_text)
452
  return str(soup)
453
 
454
- def fix_visual_after_section(report_html: str, section_title: str, instructions: str) -> str:
 
 
 
 
 
455
  soup = BeautifulSoup(report_html, "html.parser")
456
- # Find the header (any h1-h4) that contains the section title (case-insensitive)
457
- header = soup.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and section_title.lower() in tag.get_text(strip=True).lower())
 
 
 
 
 
458
  if header:
459
- # Find the next sibling that is an iframe (assuming that is the visual)
460
- iframe = header.find_next_sibling("iframe")
461
  if iframe and iframe.has_attr("srcdoc"):
462
  current_code = iframe["srcdoc"]
463
- prompt = f"""Improve the following visual code (e.g., adjust parenthesis, simplify styling) based on the additional instructions below.
464
- Current code: {current_code}
465
- Additional instructions: {instructions}
466
- Return only the improved code (no other explanation)."""
467
- improved_code = llm_call(prompt, model="o3-mini", temperature=0, max_tokens_param=2000).strip()
468
- # Re-inject the improved code
469
- iframe["srcdoc"] = improved_code
 
 
 
 
 
 
 
 
 
470
  return str(soup)
471
 
472
  def snippet_in_tag(tag: Tag, snippet: str) -> bool:
@@ -674,38 +693,36 @@ Only output valid JSON with no comments or code fences."""
674
 
675
  updated_report_html = str(soup)
676
 
677
- # (Step 5 and Step 6 remain as before to update the reference table and the QA log)
678
-
679
- prompt_refs = (f"""
680
- You are a technical editor.
681
-
682
- Review the following updated report HTML.
683
- If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
684
- generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
685
- Updated Report HTML:\n{updated_report_html}"""
686
- )
687
- updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1000)
688
- updated_refs = updated_refs.strip().strip("```")
689
-
690
- if updated_refs:
691
- soup_updated = BeautifulSoup(updated_report_html, "html.parser")
692
- ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
693
- if ref_heading:
694
- next_sibling = ref_heading.find_next_sibling()
695
- if next_sibling:
696
- try:
697
- new_ref_html = BeautifulSoup(updated_refs, "html.parser")
698
- next_sibling.replace_with(new_ref_html)
699
- logging.info("fine_tune_report: Reference table updated successfully.")
700
- except Exception as e:
701
- logging.error("fine_tune_report: Error updating reference table: %s", e)
 
702
  else:
703
- logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
704
- updated_report_html = str(soup_updated)
705
  else:
706
- logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
707
- else:
708
- logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
709
 
710
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
711
  updated_qa = qa.strip() + "\n----------\n" + global_summary
@@ -754,18 +771,16 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
754
  adjustment_request = user_message.replace("@improve", "").strip()
755
 
756
  # --- CASE 1: Removal request ---
757
- # e.g., "remove the following text: [[ABC]]"
758
- text_removal_match = re.search(r"remove the following text:\s*\[\[([^\]]+)\]\]", adjustment_request, re.I)
759
- if text_removal_match:
760
- text_to_remove = text_removal_match.group(1).strip()
761
  updated_report = remove_text_from_html(report_text, text_to_remove)
762
  answer = f"Removed text: '{text_to_remove}' from the report."
763
  chat_history.append([user_message, answer])
764
  return chat_history, "", updated_report
765
 
766
- # --- CASE 2: Fix visual request ---
767
- # e.g., "fix visual after section XYZ: <extra instructions...>"
768
- visual_fix_match = re.search(r"fix visual after section\s+([^\:]+)(?::\s*(.*))?", adjustment_request, re.I)
769
  if visual_fix_match:
770
  section_name = visual_fix_match.group(1).strip()
771
  extra_instructions = visual_fix_match.group(2).strip() if visual_fix_match.group(2) else ""
@@ -774,7 +789,7 @@ def improve_report_from_chat(user_message: str, chat_history: list, report_text:
774
  chat_history.append([user_message, answer])
775
  return chat_history, "", updated_report
776
 
777
- # --- DEFAULT: Proceed with normal LLM-based improvement ---
778
  updated_report, _ = fine_tune_report(
779
  adjustment_request,
780
  os.getenv("OPENAI_API_KEY"),
@@ -938,7 +953,7 @@ You are a technical editor.
938
 
939
  Review the following expanded report HTML.
940
  If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
941
- generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
942
  Updated Report HTML:\n{updated_report_html}"""
943
  )
944
  updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1000)
@@ -946,7 +961,7 @@ Updated Report HTML:\n{updated_report_html}"""
946
 
947
  if updated_refs:
948
  soup_updated = BeautifulSoup(updated_report_html, "html.parser")
949
- ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
950
  if ref_heading:
951
  next_sibling = ref_heading.find_next_sibling()
952
  if next_sibling:
@@ -2240,9 +2255,9 @@ The report must follow this writing style: {reportstyle}.
2240
  --------------- Citations -----------
2241
  - The report must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below - be selective, don't put it at every sentence or every paragraph.
2242
  Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
2243
- - The name of the reference table should be: "Reference Summary Table"
2244
  - The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
2245
- - The report MUST include a reference summary table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
2246
  - For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
2247
  - Do not add any inline citations reference in the visual and graph placeholders descriptions belo, you can add them in focus though.
2248
  - Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
@@ -2401,7 +2416,7 @@ Use the following report structure with consistency:
2401
  - Introduction
2402
  - [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
2403
  - Conclusion
2404
- - References summary table
2405
  - Report ending formatting (as mentioned before)
2406
 
2407
  {{Do not add anything after - no conclusive meta comment or content}}
 
451
  child.replace_with(new_text)
452
  return str(soup)
453
 
454
def fix_visual_after_section(report_html: str, section_title: str, extra_instructions: str) -> str:
    """Ask the LLM to clean up the first visual that follows a given section header.

    The report is parsed with BeautifulSoup. The first h1-h4 header whose text
    contains ``section_title`` (case-insensitive), and does not contain
    "table of contents", is selected. The first ``<iframe>`` that appears after
    that header in document order is then looked up; if it carries a ``srcdoc``
    attribute, that code is sent to ``llm_call`` together with
    ``extra_instructions`` and the returned code replaces the attribute.

    Args:
        report_html: Full report HTML as a string.
        section_title: Text to look for inside a header (the value taken from
            inside ``[[ ]]`` in the chat request).
        extra_instructions: Free-text guidance appended to the LLM prompt; may
            be empty.

    Returns:
        The re-serialized report HTML. When the section, the iframe, or a usable
        LLM answer is missing, the visual is left unchanged and an error is
        logged.
    """
    soup = BeautifulSoup(report_html, "html.parser")

    wanted = section_title.strip().lower()
    if not wanted:
        # "" is a substring of every header, so an empty title would silently
        # pick the first header of the report and rewrite an unrelated visual.
        logging.error("fix_visual_after_section: Section '%s' not found.", section_title)
        return str(soup)

    # First h1-h4 containing the title, skipping table-of-contents headings.
    header = None
    for tag in soup.find_all(["h1", "h2", "h3", "h4"]):
        heading_text = tag.get_text(strip=True).lower()
        if wanted in heading_text and "table of contents" not in heading_text:
            header = tag
            break
    if header is None:
        logging.error("fix_visual_after_section: Section '%s' not found.", section_title)
        return str(soup)

    # find_next walks forward in document order, so the iframe does not need
    # to be a direct sibling of the header.
    iframe = header.find_next("iframe")
    if iframe is None or not iframe.has_attr("srcdoc"):
        logging.error("fix_visual_after_section: No iframe found after section '%s'.", section_title)
        return str(soup)

    current_code = iframe["srcdoc"]
    prompt = (
        f"Improve the following visual code by simplifying its formatting (e.g., adjust parenthesis, remove extra styling) "
        f"based on these extra instructions: {extra_instructions}\n\n"
        f"Current code:\n{current_code}\n\n"
        "Return only the improved code (no extra commentary)."
    )
    raw_answer = llm_call(prompt=prompt, model="o3-mini", temperature=0, max_tokens_param=1500)
    # Guard against a None return so a failed call cannot raise here.
    improved_code = (raw_answer or "").strip()

    # The reference-table update elsewhere in this file strips ``` fences from
    # LLM output; do the same here so a fenced answer is not injected verbatim
    # into the iframe.
    fenced = re.match(r"^```[\w+-]*[ \t]*\n(.*?)\n?```$", improved_code, re.DOTALL)
    if fenced:
        improved_code = fenced.group(1).strip()

    # Only re-inject a non-empty answer that is not the error marker.
    if improved_code and not improved_code.lower().startswith("error: empty response"):
        iframe["srcdoc"] = improved_code
    else:
        logging.error("fix_visual_after_section: LLM returned an empty or error response.")
    return str(soup)
490
 
491
  def snippet_in_tag(tag: Tag, snippet: str) -> bool:
 
693
 
694
  updated_report_html = str(soup)
695
 
696
+ # Step 5 (and 6): Update the reference table if needed.
697
+ prompt_refs = (
698
+ f"\nYou are a technical editor.\n\n"
699
+ "Review the following updated report HTML. If any new inline citations (e.g., [x]) have been added that are not in the original reference table,\n"
700
+ "generate an updated Reference Summary Table as valid HTML. Output only the updated table without any additional comments.\n\n"
701
+ f"Updated Report HTML:\n{updated_report_html}"
702
+ )
703
+ # Increase token limit to 1500 for this call
704
+ updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1500)
705
+ updated_refs = updated_refs.strip().strip("```").strip()
706
+
707
+ if updated_refs and not updated_refs.lower().startswith("error: empty response"):
708
+ soup_updated = BeautifulSoup(updated_report_html, "html.parser")
709
+ ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "reference summary table" in tag.get_text(strip=True).lower())
710
+ if ref_heading:
711
+ next_sibling = ref_heading.find_next_sibling()
712
+ if next_sibling:
713
+ try:
714
+ new_ref_html = BeautifulSoup(updated_refs, "html.parser")
715
+ next_sibling.replace_with(new_ref_html)
716
+ logging.info("fine_tune_report: Reference table updated successfully.")
717
+ except Exception as e:
718
+ logging.error("fine_tune_report: Error updating reference table: %s", e)
719
+ else:
720
+ logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
721
+ updated_report_html = str(soup_updated)
722
  else:
723
+ logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
 
724
  else:
725
+ logging.info("fine_tune_report: No valid updated reference table returned; leaving unchanged.")
 
 
726
 
727
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
728
  updated_qa = qa.strip() + "\n----------\n" + global_summary
 
771
  adjustment_request = user_message.replace("@improve", "").strip()
772
 
773
  # --- CASE 1: Removal request ---
774
+ removal_match = re.search(r"remove the following text:\s*\[\[([^\]]+)\]\]", adjustment_request, re.I)
775
+ if removal_match:
776
+ text_to_remove = removal_match.group(1).strip()
 
777
  updated_report = remove_text_from_html(report_text, text_to_remove)
778
  answer = f"Removed text: '{text_to_remove}' from the report."
779
  chat_history.append([user_message, answer])
780
  return chat_history, "", updated_report
781
 
782
+ # --- CASE 2: Visual fix request ---
783
+ visual_fix_match = re.search(r"fix visual after section\s+\[\[([^\]]+)\]\](?::\s*(.*))?", adjustment_request, re.I)
 
784
  if visual_fix_match:
785
  section_name = visual_fix_match.group(1).strip()
786
  extra_instructions = visual_fix_match.group(2).strip() if visual_fix_match.group(2) else ""
 
789
  chat_history.append([user_message, answer])
790
  return chat_history, "", updated_report
791
 
792
+ # --- DEFAULT: Proceed with existing LLM-based improvement (fine_tune_report) ---
793
  updated_report, _ = fine_tune_report(
794
  adjustment_request,
795
  os.getenv("OPENAI_API_KEY"),
 
953
 
954
  Review the following expanded report HTML.
955
  If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table,
956
+ generate an updated References Summary Table as valid HTML. Output only the updated reference table HTML with no explanations.
957
  Updated Report HTML:\n{updated_report_html}"""
958
  )
959
  updated_refs = llm_call(prompt=prompt_refs, model="o3-mini", temperature=0, max_tokens_param=1000)
 
961
 
962
  if updated_refs:
963
  soup_updated = BeautifulSoup(updated_report_html, "html.parser")
964
+ ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "References Summary Table" in tag.get_text())
965
  if ref_heading:
966
  next_sibling = ref_heading.find_next_sibling()
967
  if next_sibling:
 
2255
  --------------- Citations -----------
2256
  - The report must include inline citations (e.g., [1], [2], etc.) from real sources provided in the search results below - be selective, don't put it at every sentence or every paragraph.
2257
  Note: citations sources in-line need to be in this format: blablabla - Source [x] / "pdf" is not a source, provide the title or author
2258
+ - The name of the reference table should be: "References Summary Table"
2259
  - The reference table at the end containing the citations details should have 4 columns: the ref number, the title of the document, the author(s, the URL - with hyperlink)
2260
+ - The report MUST include a References Summary Table with between 10 (for a 8 page report) and 30 references (for a 40 pages report). All inline citations (e.g., [1], [2], …) present in the report and in any focus placeholders MUST have a corresponding entry in this table with its full URL.
2261
  - For the reference citations, add systematically the urls from the Learnings (no need to put them in numbered list format since we alredy have the [x] that serves as number list)
2262
  - Do not add any inline citations reference in the visual and graph placeholders descriptions belo, you can add them in focus though.
2263
  - Do not make false references / citations. It has to be grounded from the sources in the rsearch results / crumbs below (no example.com/... type references!)
 
2416
  - Introduction
2417
  - [Sections and sub-sections, depending on the size and relevant topic - including visual, graph and focus placeholders]
2418
  - Conclusion
2419
+ - References Summary Table
2420
  - Report ending formatting (as mentioned before)
2421
 
2422
  {{Do not add anything after - no conclusive meta comment or content}}