Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -97,157 +97,127 @@ def clean_llm_response(response: str) -> str:
|
|
| 97 |
import logging
|
| 98 |
from bs4 import BeautifulSoup
|
| 99 |
|
| 100 |
-
def expand_snippet_area(
|
| 101 |
"""
|
| 102 |
-
Given
|
| 103 |
It then uses an iterative while loop to traverse upward (from the immediate parent to the top)
|
| 104 |
-
until the highest level iframe is reached or (if no iframe is present) until a div or table is
|
| 105 |
-
encountered—the first allowed container (div or table) found is used. If neither
|
| 106 |
-
|
| 107 |
|
| 108 |
-
Logging is
|
| 109 |
"""
|
| 110 |
allowed_tags = {"div", "table"}
|
| 111 |
-
|
| 112 |
-
logging.info("
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
logging.info(f"Searching for all elements containing the snippet: '{snippet}'")
|
| 116 |
-
# Get all tags where the snippet is contained in the aggregated text.
|
| 117 |
candidates = soup.find_all(lambda tag: tag.get_text() and snippet in tag.get_text())
|
| 118 |
if not candidates:
|
| 119 |
-
logging.info("No element containing the snippet was found. Returning
|
| 120 |
-
return
|
| 121 |
|
| 122 |
-
# Choose the candidate with the greatest depth (i.e.
|
| 123 |
-
# This gives us the smallest container containing the snippet.
|
| 124 |
candidate = max(candidates, key=lambda tag: len(list(tag.parents)))
|
| 125 |
-
logging.info(
|
| 126 |
|
| 127 |
iframe_candidate = None
|
| 128 |
allowed_candidate = None
|
| 129 |
|
| 130 |
-
# Iterate upward from the candidate's
|
| 131 |
current = candidate.parent
|
| 132 |
while current is not None and current.name.lower() != "body":
|
| 133 |
-
logging.info(
|
| 134 |
tag_name = current.name.lower()
|
| 135 |
if tag_name == "iframe":
|
| 136 |
iframe_candidate = current
|
| 137 |
logging.info("Found an <iframe> container; updating iframe_candidate.")
|
| 138 |
elif tag_name in allowed_tags and allowed_candidate is None:
|
| 139 |
allowed_candidate = current
|
| 140 |
-
logging.info(
|
| 141 |
current = current.parent
|
| 142 |
|
| 143 |
-
# Decision on which container to return based on the priority:
|
| 144 |
if iframe_candidate is not None:
|
| 145 |
-
logging.info("Returning
|
| 146 |
-
return
|
| 147 |
elif allowed_candidate is not None:
|
| 148 |
-
logging.info("No iframe found; returning
|
| 149 |
-
return
|
| 150 |
else:
|
| 151 |
-
logging.info("No iframe, div, or table container found; returning candidate element
|
| 152 |
-
return
|
| 153 |
|
| 154 |
|
|
|
|
| 155 |
def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
|
| 156 |
initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
|
| 157 |
complementary_guidance: str) -> (str, str):
|
| 158 |
"""
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area)
|
| 166 |
-
to search for that text exactly and select the outer container (<div>, <table>, or <iframe>).
|
| 167 |
-
3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
|
| 168 |
-
and the user adjustment request, and outputs a corrected version.
|
| 169 |
-
4. The code then replaces the original container with the updated version in the BeautifulSoup object.
|
| 170 |
-
5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
|
| 171 |
-
the reference table.
|
| 172 |
-
6. A summary of all corrections is appended to the QA log.
|
| 173 |
"""
|
|
|
|
|
|
|
|
|
|
| 174 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 175 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 176 |
|
| 177 |
logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
|
| 178 |
|
| 179 |
-
# Step 1:
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
// Examples
|
| 187 |
-
1) if the user request to "Add xyz in the conclusion", the unique string to identify should be specific to the conclusion
|
| 188 |
-
2) if the user request to "correct the graph after section 1.2", the unique string should be one of the string that appear specifically in the graph after section 1.2 (ex: the title)
|
| 189 |
-
3) if the user request is "Remove any mention about the car industry", the unique string(s) should be a sentence that would be in a paragraph of the report that would talk about car industry
|
| 190 |
-
--> The unique string is what would allow to identify precisely through a search the section targeted by the user request, it has to be concise and unique.
|
| 191 |
-
|
| 192 |
-
Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
|
| 193 |
-
Ensure these strings exactly match the content in the report.
|
| 194 |
-
|
| 195 |
-
Full Report HTML:
|
| 196 |
-
{report_html}
|
| 197 |
-
|
| 198 |
-
User Adjustment Request:
|
| 199 |
-
{adjustment_request}
|
| 200 |
-
|
| 201 |
Only output valid JSON."""
|
| 202 |
)
|
| 203 |
-
|
| 204 |
response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=5000, temperature=0)
|
| 205 |
-
logging.info(
|
| 206 |
-
|
| 207 |
try:
|
| 208 |
response_identify = clean_llm_response(response_identify.strip().strip("```"))
|
| 209 |
id_data = json.loads(response_identify)
|
| 210 |
unique_strings = id_data.get("identified_unique_strings", [])
|
| 211 |
except Exception as e:
|
| 212 |
-
logging.error(
|
| 213 |
unique_strings = []
|
| 214 |
|
| 215 |
if not unique_strings:
|
| 216 |
logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
|
| 217 |
return report_html, qa
|
| 218 |
-
|
| 219 |
-
# Step 2:
|
| 220 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 221 |
corrections_summary = []
|
|
|
|
| 222 |
for uniq_str in unique_strings:
|
| 223 |
uniq_str = uniq_str.strip()
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
candidate = soup.find(lambda tag: container_html in str(tag))
|
| 230 |
-
if not candidate:
|
| 231 |
-
logging.warning(f"fine_tune_report: The container for unique string was not found: {uniq_str}")
|
| 232 |
continue
|
| 233 |
-
|
| 234 |
-
original_container_html = str(
|
| 235 |
-
logging.info(
|
| 236 |
|
| 237 |
# Step 3: Call the LLM to adjust this container.
|
| 238 |
-
prompt_adjust = (
|
| 239 |
-
You are a technical editor.
|
| 240 |
Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
|
| 241 |
produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
|
| 242 |
-
The updated version will put back
|
| 243 |
|
| 244 |
-
// Context
|
| 245 |
- Overall Report HTML:
|
| 246 |
{report_html}
|
| 247 |
-
- Knowledge Crumbs
|
| 248 |
{knowledge_crumbs}
|
| 249 |
|
| 250 |
-
// Request
|
| 251 |
- Original Container to Adjust:
|
| 252 |
{original_container_html}
|
| 253 |
|
|
@@ -259,7 +229,8 @@ Additional Guidance:
|
|
| 259 |
- Complementary Guidance:
|
| 260 |
{complementary_guidance}
|
| 261 |
|
| 262 |
-
Ensure the updated content remains consistent with the overall report style.
|
|
|
|
| 263 |
- "improved" (the corrected container's full HTML) and
|
| 264 |
- "summary" (a brief explanation of the changes)
|
| 265 |
|
|
@@ -267,30 +238,31 @@ Only output valid JSON."""
|
|
| 267 |
)
|
| 268 |
|
| 269 |
response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
|
| 270 |
-
logging.info(
|
| 271 |
try:
|
| 272 |
response_adjust = clean_llm_response(response_adjust.strip().strip("```"))
|
| 273 |
-
logging.info(
|
| 274 |
adjust_data = json.loads(response_adjust)
|
| 275 |
corrected_container = adjust_data.get("improved", "").strip()
|
| 276 |
container_summary = adjust_data.get("summary", "").strip()
|
| 277 |
except Exception as e:
|
| 278 |
-
logging.error(
|
| 279 |
continue
|
| 280 |
-
|
| 281 |
if not corrected_container:
|
| 282 |
-
logging.warning("fine_tune_report: No improved container was generated; skipping.")
|
| 283 |
continue
|
| 284 |
-
|
| 285 |
corrections_summary.append(f"Container corrected: {container_summary}")
|
| 286 |
|
| 287 |
-
# Step 4: Replace the original container with the updated one.
|
| 288 |
-
|
| 289 |
logging.info("fine_tune_report: Updated container re-injected.")
|
| 290 |
|
| 291 |
updated_report_html = str(soup)
|
| 292 |
|
| 293 |
-
# Step 5
|
|
|
|
| 294 |
prompt_refs = (
|
| 295 |
f"You are a technical editor. Review the following updated report HTML. "
|
| 296 |
f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
|
|
@@ -311,16 +283,15 @@ Only output valid JSON."""
|
|
| 311 |
next_sibling.replace_with(new_ref_html)
|
| 312 |
logging.info("fine_tune_report: Reference table updated successfully.")
|
| 313 |
except Exception as e:
|
| 314 |
-
logging.error(
|
| 315 |
else:
|
| 316 |
-
logging.info("fine_tune_report: No sibling after reference heading; skipping update.")
|
| 317 |
updated_report_html = str(soup_updated)
|
| 318 |
else:
|
| 319 |
logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
|
| 320 |
else:
|
| 321 |
logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
|
| 322 |
|
| 323 |
-
# Step 6: Append a summary of corrections to the existing QA log.
|
| 324 |
global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
|
| 325 |
updated_qa = qa.strip() + "\n----------\n" + global_summary
|
| 326 |
|
|
|
|
import logging
from typing import Optional

from bs4 import BeautifulSoup, Tag
| 100 |
+
def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> Optional[Tag]:
    """Find the smallest element containing *snippet* and expand it to a container.

    Starting from the deepest tag whose aggregated text contains the snippet,
    walk upward through its ancestors (stopping at <body>).  Selection priority:

    1. the highest-level ``<iframe>`` ancestor, if any;
    2. otherwise the first (closest) ``<div>`` or ``<table>`` ancestor;
    3. otherwise the candidate element itself.

    Logging is provided at each key step.

    Args:
        soup: Parsed document to search (the same instance the caller mutates,
            so the returned Tag can be replaced in place).
        snippet: Exact text fragment expected to occur in the document.

    Returns:
        The chosen container ``Tag``, or ``None`` when no element contains
        the snippet.
    """
    allowed_tags = {"div", "table"}

    logging.info("Searching for all elements containing the snippet: '%s'", snippet)
    # Get all tags where the snippet is contained in the aggregated text.
    candidates = soup.find_all(lambda tag: tag.get_text() and snippet in tag.get_text())
    if not candidates:
        logging.info("No element containing the snippet was found. Returning None.")
        return None

    # Choose the candidate with the greatest depth (most ancestors): this is
    # the smallest container whose text still holds the snippet.
    candidate = max(candidates, key=lambda tag: len(list(tag.parents)))
    logging.info("Candidate element selected based on depth (<%s>): %s", candidate.name, candidate)

    iframe_candidate = None
    allowed_candidate = None

    # Iterate upward from the candidate's parent toward the document root.
    # NOTE(review): the document root's name is "[document]" for html.parser;
    # the extra `current.name` check guards against a None name on exotic nodes.
    current = candidate.parent
    while current is not None and current.name and current.name.lower() != "body":
        logging.info("Evaluating parent element: <%s>", current.name)
        tag_name = current.name.lower()
        if tag_name == "iframe":
            # Keep overwriting so we end up with the HIGHEST-level iframe.
            iframe_candidate = current
            logging.info("Found an <iframe> container; updating iframe_candidate.")
        elif tag_name in allowed_tags and allowed_candidate is None:
            # Only the FIRST (closest) div/table is kept.
            allowed_candidate = current
            logging.info("Found allowed container <%s>; setting allowed_candidate.", tag_name)
        current = current.parent

    # Decide which container to return, by priority: iframe > div/table > self.
    if iframe_candidate is not None:
        logging.info("Returning the iframe container.")
        return iframe_candidate
    if allowed_candidate is not None:
        logging.info("No iframe found; returning the first allowed container (div/table).")
        return allowed_candidate
    logging.info("No iframe, div, or table container found; returning candidate element.")
    return candidate
| 148 |
|
| 149 |
|
| 150 |
+
# In fine_tune_report, use the same soup instance:
|
| 151 |
def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
|
| 152 |
initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
|
| 153 |
complementary_guidance: str) -> (str, str):
|
| 154 |
"""
|
| 155 |
+
...
|
| 156 |
+
The function fine-tunes the report by:
|
| 157 |
+
1. Identifying unique strings in the area to adjust.
|
| 158 |
+
2. Using expand_snippet_area (which now receives a BeautifulSoup object) to locate the container.
|
| 159 |
+
3. Calling an LLM to produce an improved container and then replacing the original.
|
| 160 |
+
4. Optionally updating the reference table and appending a summary.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
"""
|
| 162 |
+
import os
|
| 163 |
+
import json
|
| 164 |
+
|
| 165 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 166 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 167 |
|
| 168 |
logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
|
| 169 |
|
| 170 |
+
# Step 1: (LLM call to get unique strings) ...
|
| 171 |
+
# [Assume this part remains unchanged and unique_strings is obtained]
|
| 172 |
+
|
| 173 |
+
prompt_identify = (
|
| 174 |
+
f"""You are a meticulous technical editor.
|
| 175 |
+
...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
Only output valid JSON."""
|
| 177 |
)
|
|
|
|
| 178 |
response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=5000, temperature=0)
|
| 179 |
+
logging.info("fine_tune_report: Raw unique string identification response: %s", response_identify)
|
| 180 |
+
|
| 181 |
try:
|
| 182 |
response_identify = clean_llm_response(response_identify.strip().strip("```"))
|
| 183 |
id_data = json.loads(response_identify)
|
| 184 |
unique_strings = id_data.get("identified_unique_strings", [])
|
| 185 |
except Exception as e:
|
| 186 |
+
logging.error("fine_tune_report: Error parsing unique strings JSON: %s", e)
|
| 187 |
unique_strings = []
|
| 188 |
|
| 189 |
if not unique_strings:
|
| 190 |
logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
|
| 191 |
return report_html, qa
|
| 192 |
+
|
| 193 |
+
# Step 2: Parse the report HTML once.
|
| 194 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 195 |
corrections_summary = []
|
| 196 |
+
|
| 197 |
for uniq_str in unique_strings:
|
| 198 |
uniq_str = uniq_str.strip()
|
| 199 |
+
logging.info("fine_tune_report: Processing unique string: '%s'", uniq_str)
|
| 200 |
+
# Use expand_snippet_area to get the container Tag directly.
|
| 201 |
+
container_tag = expand_snippet_area(soup, uniq_str)
|
| 202 |
+
if container_tag is None:
|
| 203 |
+
logging.warning("fine_tune_report: Could not locate a container for unique string: '%s'", uniq_str)
|
|
|
|
|
|
|
|
|
|
| 204 |
continue
|
| 205 |
+
|
| 206 |
+
original_container_html = str(container_tag)
|
| 207 |
+
logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
|
| 208 |
|
| 209 |
# Step 3: Call the LLM to adjust this container.
|
| 210 |
+
prompt_adjust = (
|
| 211 |
+
f"""You are a technical editor.
|
| 212 |
Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
|
| 213 |
produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
|
| 214 |
+
The updated version will be put back in the exact same location and must have the same outer tags.
|
| 215 |
|
|
|
|
| 216 |
- Overall Report HTML:
|
| 217 |
{report_html}
|
| 218 |
+
- Knowledge Crumbs:
|
| 219 |
{knowledge_crumbs}
|
| 220 |
|
|
|
|
| 221 |
- Original Container to Adjust:
|
| 222 |
{original_container_html}
|
| 223 |
|
|
|
|
| 229 |
- Complementary Guidance:
|
| 230 |
{complementary_guidance}
|
| 231 |
|
| 232 |
+
Ensure the updated content remains consistent with the overall report style.
|
| 233 |
+
Output a JSON object with exactly two keys:
|
| 234 |
- "improved" (the corrected container's full HTML) and
|
| 235 |
- "summary" (a brief explanation of the changes)
|
| 236 |
|
|
|
|
| 238 |
)
|
| 239 |
|
| 240 |
response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
|
| 241 |
+
logging.info("fine_tune_report: Raw container adjustment response: %s", response_adjust)
|
| 242 |
try:
|
| 243 |
response_adjust = clean_llm_response(response_adjust.strip().strip("```"))
|
| 244 |
+
logging.info("Cleaned container adjustment response: %s", response_adjust)
|
| 245 |
adjust_data = json.loads(response_adjust)
|
| 246 |
corrected_container = adjust_data.get("improved", "").strip()
|
| 247 |
container_summary = adjust_data.get("summary", "").strip()
|
| 248 |
except Exception as e:
|
| 249 |
+
logging.error("fine_tune_report: Error parsing container adjustment JSON: %s", e)
|
| 250 |
continue
|
| 251 |
+
|
| 252 |
if not corrected_container:
|
| 253 |
+
logging.warning("fine_tune_report: No improved container was generated; skipping correction for this container.")
|
| 254 |
continue
|
| 255 |
+
|
| 256 |
corrections_summary.append(f"Container corrected: {container_summary}")
|
| 257 |
|
| 258 |
+
# Step 4: Replace the original container with the updated one in our soup.
|
| 259 |
+
container_tag.replace_with(BeautifulSoup(corrected_container, "html.parser"))
|
| 260 |
logging.info("fine_tune_report: Updated container re-injected.")
|
| 261 |
|
| 262 |
updated_report_html = str(soup)
|
| 263 |
|
| 264 |
+
# (Step 5 and Step 6 remain as before to update the reference table and the QA log)
|
| 265 |
+
|
| 266 |
prompt_refs = (
|
| 267 |
f"You are a technical editor. Review the following updated report HTML. "
|
| 268 |
f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
|
|
|
|
| 283 |
next_sibling.replace_with(new_ref_html)
|
| 284 |
logging.info("fine_tune_report: Reference table updated successfully.")
|
| 285 |
except Exception as e:
|
| 286 |
+
logging.error("fine_tune_report: Error updating reference table: %s", e)
|
| 287 |
else:
|
| 288 |
+
logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
|
| 289 |
updated_report_html = str(soup_updated)
|
| 290 |
else:
|
| 291 |
logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
|
| 292 |
else:
|
| 293 |
logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
|
| 294 |
|
|
|
|
| 295 |
global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
|
| 296 |
updated_qa = qa.strip() + "\n----------\n" + global_summary
|
| 297 |
|