Guiyom committed on
Commit
5a160ac
·
verified ·
1 Parent(s): 9f7d347

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -99
app.py CHANGED
@@ -97,157 +97,127 @@ def clean_llm_response(response: str) -> str:
97
  import logging
98
  from bs4 import BeautifulSoup
99
 
100
def expand_snippet_area(full_html: str, snippet: str) -> str:
    """
    Locate the deepest element of *full_html* whose text contains *snippet*,
    then widen the selection to a sensible outer container.

    Walking upward from the matched element (stopping at <body>), the search
    prefers, in priority order:
      1. the highest-level <iframe> ancestor found,
      2. otherwise the first <div> or <table> ancestor found,
      3. otherwise the matched element itself.

    Parameters
    ----------
    full_html : str
        The complete HTML document to search.
    snippet : str
        Plain text expected to appear verbatim in the document.

    Returns
    -------
    str
        The outer HTML of the chosen container, or *snippet* unchanged when
        no element contains it (callers cannot distinguish this from a found
        container by type — only by content).
    """
    allowed_tags = {"div", "table"}

    logging.info("Parsing full HTML with BeautifulSoup.")
    soup = BeautifulSoup(full_html, "html.parser")

    # Lazy %-style args: avoids formatting (and str()-serializing elements)
    # when the log level is disabled, unlike the previous f-strings.
    logging.info("Searching for all elements containing the snippet: '%s'", snippet)
    # Every tag whose aggregated text contains the snippet.
    candidates = soup.find_all(lambda tag: tag.get_text() and snippet in tag.get_text())
    if not candidates:
        logging.info("No element containing the snippet was found. Returning snippet.")
        return snippet

    # Deepest candidate (most ancestors) == smallest container holding the snippet.
    candidate = max(candidates, key=lambda tag: len(list(tag.parents)))
    logging.info("Candidate element selected based on depth (<%s>): %s", candidate.name, candidate)

    iframe_candidate = None
    allowed_candidate = None

    # Iterate upward from the candidate's direct parent.
    current = candidate.parent
    while current is not None and current.name.lower() != "body":
        logging.info("Evaluating parent element: <%s>", current.name)
        tag_name = current.name.lower()
        if tag_name == "iframe":
            # Keep overwriting so the HIGHEST-level iframe wins.
            iframe_candidate = current
            logging.info("Found an <iframe> container; updating iframe_candidate.")
        elif tag_name in allowed_tags and allowed_candidate is None:
            # Only the FIRST (innermost) div/table is kept.
            allowed_candidate = current
            logging.info("Found allowed container <%s>; setting allowed_candidate.", tag_name)
        current = current.parent

    # Priority: iframe > first div/table > the candidate itself.
    if iframe_candidate is not None:
        logging.info("Returning outer HTML of the iframe container.")
        return str(iframe_candidate)
    if allowed_candidate is not None:
        logging.info("No iframe found; returning outer HTML of the first allowed container (div/table).")
        return str(allowed_candidate)
    logging.info("No iframe, div, or table container found; returning candidate element's HTML.")
    return str(candidate)
153
 
154
 
 
155
  def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
156
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
157
  complementary_guidance: str) -> (str, str):
158
  """
159
- Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
160
-
161
- Process Overview:
162
- 1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
163
- The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
164
- that are representative of the targeted area(s) in the report.
165
- 2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area)
166
- to search for that text exactly and select the outer container (<div>, <table>, or <iframe>).
167
- 3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
168
- and the user adjustment request, and outputs a corrected version.
169
- 4. The code then replaces the original container with the updated version in the BeautifulSoup object.
170
- 5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
171
- the reference table.
172
- 6. A summary of all corrections is appended to the QA log.
173
  """
 
 
 
174
  os.environ["OPENAI_API_KEY"] = openai_api_key
175
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
176
 
177
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
178
 
179
- # Step 1: Get unique plain text string(s) (without HTML) that identify the targeted area.
180
- prompt_identify = (f"""
181
- You are a meticulous technical editor.
182
-
183
- Below is the full report HTML and a user adjustment request.
184
- Extract one or more unique plain-text string(s) (without any HTML tags or formatting) that uniquely appear in the area(s) targeted by the adjustment request.
185
-
186
- // Examples
187
- 1) if the user request to "Add xyz in the conclusion", the unique string to identify should be specific to the conclusion
188
- 2) if the user request to "correct the graph after section 1.2", the unique string should be one of the string that appear specifically in the graph after section 1.2 (ex: the title)
189
- 3) if the user request is "Remove any mention about the car industry", the unique string(s) should be a sentence that would be in a paragraph of the report that would talk about car industry
190
- --> The unique string is what would allow to identify precisely through a search the section targeted by the user request, it has to be concise and unique.
191
-
192
- Output them in a JSON object with the key "identified_unique_strings" mapped to a list of strings.
193
- Ensure these strings exactly match the content in the report.
194
-
195
- Full Report HTML:
196
- {report_html}
197
-
198
- User Adjustment Request:
199
- {adjustment_request}
200
-
201
  Only output valid JSON."""
202
  )
203
-
204
  response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=5000, temperature=0)
205
- logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
206
-
207
  try:
208
  response_identify = clean_llm_response(response_identify.strip().strip("```"))
209
  id_data = json.loads(response_identify)
210
  unique_strings = id_data.get("identified_unique_strings", [])
211
  except Exception as e:
212
- logging.error(f"fine_tune_report: Error parsing unique strings JSON: {e}")
213
  unique_strings = []
214
 
215
  if not unique_strings:
216
  logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
217
  return report_html, qa
218
-
219
- # Step 2: For each unique string, find the corresponding outer container.
220
  soup = BeautifulSoup(report_html, "html.parser")
221
  corrections_summary = []
 
222
  for uniq_str in unique_strings:
223
  uniq_str = uniq_str.strip()
224
- # Use expand_snippet_area to get the outer container for the unique text.
225
- container_html = expand_snippet_area(report_html, uniq_str)
226
- if not container_html:
227
- logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
228
- continue
229
- candidate = soup.find(lambda tag: container_html in str(tag))
230
- if not candidate:
231
- logging.warning(f"fine_tune_report: The container for unique string was not found: {uniq_str}")
232
  continue
233
-
234
- original_container_html = str(candidate)
235
- logging.info(f"fine_tune_report: Found container for unique string adjustment:\n\n{original_container_html}\n\n")
236
 
237
  # Step 3: Call the LLM to adjust this container.
238
- prompt_adjust = (f"""
239
- You are a technical editor.
240
  Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
241
  produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
242
- The updated version will put back precisely in the same location, the output should have the same outer tags.
243
 
244
- // Context
245
  - Overall Report HTML:
246
  {report_html}
247
- - Knowledge Crumbs you can use if relevant - collected from various search results:
248
  {knowledge_crumbs}
249
 
250
- // Request
251
  - Original Container to Adjust:
252
  {original_container_html}
253
 
@@ -259,7 +229,8 @@ Additional Guidance:
259
  - Complementary Guidance:
260
  {complementary_guidance}
261
 
262
- Ensure the updated content remains consistent with the overall report style. Output a JSON object with exactly two keys:
 
263
  - "improved" (the corrected container's full HTML) and
264
  - "summary" (a brief explanation of the changes)
265
 
@@ -267,30 +238,31 @@ Only output valid JSON."""
267
  )
268
 
269
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
270
- logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
271
  try:
272
  response_adjust = clean_llm_response(response_adjust.strip().strip("```"))
273
- logging.info(f"Cleaned container adjustment response: {response_adjust}")
274
  adjust_data = json.loads(response_adjust)
275
  corrected_container = adjust_data.get("improved", "").strip()
276
  container_summary = adjust_data.get("summary", "").strip()
277
  except Exception as e:
278
- logging.error(f"fine_tune_report: Error parsing container adjustment JSON: {e}")
279
  continue
280
-
281
  if not corrected_container:
282
- logging.warning("fine_tune_report: No improved container was generated; skipping.")
283
  continue
284
-
285
  corrections_summary.append(f"Container corrected: {container_summary}")
286
 
287
- # Step 4: Replace the original container with the updated one.
288
- candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
289
  logging.info("fine_tune_report: Updated container re-injected.")
290
 
291
  updated_report_html = str(soup)
292
 
293
- # Step 5: (Optional) Update the reference table if needed.
 
294
  prompt_refs = (
295
  f"You are a technical editor. Review the following updated report HTML. "
296
  f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
@@ -311,16 +283,15 @@ Only output valid JSON."""
311
  next_sibling.replace_with(new_ref_html)
312
  logging.info("fine_tune_report: Reference table updated successfully.")
313
  except Exception as e:
314
- logging.error(f"fine_tune_report: Error updating reference table: {e}")
315
  else:
316
- logging.info("fine_tune_report: No sibling after reference heading; skipping update.")
317
  updated_report_html = str(soup_updated)
318
  else:
319
  logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
320
  else:
321
  logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
322
 
323
- # Step 6: Append a summary of corrections to the existing QA log.
324
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
325
  updated_qa = qa.strip() + "\n----------\n" + global_summary
326
 
 
97
  import logging
98
  from bs4 import BeautifulSoup
99
 
100
def expand_snippet_area(soup: BeautifulSoup, snippet: str) -> "Tag | None":
    """
    Locate the deepest element of *soup* whose text contains *snippet*,
    then widen the selection to a sensible outer container.

    Walking upward from the matched element (stopping at <body>), the search
    prefers, in priority order:
      1. the highest-level <iframe> ancestor found,
      2. otherwise the first <div> or <table> ancestor found,
      3. otherwise the matched element itself.

    Parameters
    ----------
    soup : BeautifulSoup
        An already-parsed document; the returned Tag belongs to this tree,
        so callers can mutate it in place (e.g. via ``replace_with``).
    snippet : str
        Plain text expected to appear verbatim in the document.

    Returns
    -------
    Tag | None
        The container Tag, or None when no element contains the snippet.
        NOTE: the annotation is a quoted forward reference because ``Tag``
        is not imported at module level — only BeautifulSoup is; an unquoted
        annotation would raise NameError at definition time.
    """
    allowed_tags = {"div", "table"}

    logging.info("Searching for all elements containing the snippet: '%s'", snippet)
    # Every tag whose aggregated text contains the snippet.
    candidates = soup.find_all(lambda tag: tag.get_text() and snippet in tag.get_text())
    if not candidates:
        logging.info("No element containing the snippet was found. Returning None.")
        return None

    # Deepest candidate (most ancestors) == smallest container holding the snippet.
    candidate = max(candidates, key=lambda tag: len(list(tag.parents)))
    logging.info("Candidate element selected based on depth (<%s>): %s", candidate.name, candidate)

    iframe_candidate = None
    allowed_candidate = None

    # Iterate upward from the candidate's parent.
    current = candidate.parent
    while current is not None and current.name.lower() != "body":
        logging.info("Evaluating parent element: <%s>", current.name)
        tag_name = current.name.lower()
        if tag_name == "iframe":
            # Keep overwriting so the HIGHEST-level iframe wins.
            iframe_candidate = current
            logging.info("Found an <iframe> container; updating iframe_candidate.")
        elif tag_name in allowed_tags and allowed_candidate is None:
            # Only the FIRST (innermost) div/table is kept.
            allowed_candidate = current
            logging.info("Found allowed container <%s>; setting allowed_candidate.", tag_name)
        current = current.parent

    # Priority: iframe > first div/table > the candidate itself.
    if iframe_candidate is not None:
        logging.info("Returning the iframe container.")
        return iframe_candidate
    if allowed_candidate is not None:
        logging.info("No iframe found; returning the first allowed container (div/table).")
        return allowed_candidate
    logging.info("No iframe, div, or table container found; returning candidate element.")
    return candidate
148
 
149
 
150
+ # In fine_tune_report, use the same soup instance:
151
  def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
152
  initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
153
  complementary_guidance: str) -> (str, str):
154
  """
155
+ ...
156
+ The function fine-tunes the report by:
157
+ 1. Identifying unique strings in the area to adjust.
158
+ 2. Using expand_snippet_area (which now receives a BeautifulSoup object) to locate the container.
159
+ 3. Calling an LLM to produce an improved container and then replacing the original.
160
+ 4. Optionally updating the reference table and appending a summary.
 
 
 
 
 
 
 
 
161
  """
162
+ import os
163
+ import json
164
+
165
  os.environ["OPENAI_API_KEY"] = openai_api_key
166
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
167
 
168
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
169
 
170
+ # Step 1: (LLM call to get unique strings) ...
171
+ # [Assume this part remains unchanged and unique_strings is obtained]
172
+
173
+ prompt_identify = (
174
+ f"""You are a meticulous technical editor.
175
+ ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  Only output valid JSON."""
177
  )
 
178
  response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=5000, temperature=0)
179
+ logging.info("fine_tune_report: Raw unique string identification response: %s", response_identify)
180
+
181
  try:
182
  response_identify = clean_llm_response(response_identify.strip().strip("```"))
183
  id_data = json.loads(response_identify)
184
  unique_strings = id_data.get("identified_unique_strings", [])
185
  except Exception as e:
186
+ logging.error("fine_tune_report: Error parsing unique strings JSON: %s", e)
187
  unique_strings = []
188
 
189
  if not unique_strings:
190
  logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
191
  return report_html, qa
192
+
193
+ # Step 2: Parse the report HTML once.
194
  soup = BeautifulSoup(report_html, "html.parser")
195
  corrections_summary = []
196
+
197
  for uniq_str in unique_strings:
198
  uniq_str = uniq_str.strip()
199
+ logging.info("fine_tune_report: Processing unique string: '%s'", uniq_str)
200
+ # Use expand_snippet_area to get the container Tag directly.
201
+ container_tag = expand_snippet_area(soup, uniq_str)
202
+ if container_tag is None:
203
+ logging.warning("fine_tune_report: Could not locate a container for unique string: '%s'", uniq_str)
 
 
 
204
  continue
205
+
206
+ original_container_html = str(container_tag)
207
+ logging.info("fine_tune_report: Found container for unique string adjustment:\n\n%s\n", original_container_html)
208
 
209
  # Step 3: Call the LLM to adjust this container.
210
+ prompt_adjust = (
211
+ f"""You are a technical editor.
212
  Given the following HTML container (with its outer tags) extracted from a larger report and based on the user adjustment request,
213
  produce a corrected version by making only the necessary changes. Preserve inline citations, formatting, and context.
214
+ The updated version will be put back in the exact same location and must have the same outer tags.
215
 
 
216
  - Overall Report HTML:
217
  {report_html}
218
+ - Knowledge Crumbs:
219
  {knowledge_crumbs}
220
 
 
221
  - Original Container to Adjust:
222
  {original_container_html}
223
 
 
229
  - Complementary Guidance:
230
  {complementary_guidance}
231
 
232
+ Ensure the updated content remains consistent with the overall report style.
233
+ Output a JSON object with exactly two keys:
234
  - "improved" (the corrected container's full HTML) and
235
  - "summary" (a brief explanation of the changes)
236
 
 
238
  )
239
 
240
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
241
+ logging.info("fine_tune_report: Raw container adjustment response: %s", response_adjust)
242
  try:
243
  response_adjust = clean_llm_response(response_adjust.strip().strip("```"))
244
+ logging.info("Cleaned container adjustment response: %s", response_adjust)
245
  adjust_data = json.loads(response_adjust)
246
  corrected_container = adjust_data.get("improved", "").strip()
247
  container_summary = adjust_data.get("summary", "").strip()
248
  except Exception as e:
249
+ logging.error("fine_tune_report: Error parsing container adjustment JSON: %s", e)
250
  continue
251
+
252
  if not corrected_container:
253
+ logging.warning("fine_tune_report: No improved container was generated; skipping correction for this container.")
254
  continue
255
+
256
  corrections_summary.append(f"Container corrected: {container_summary}")
257
 
258
+ # Step 4: Replace the original container with the updated one in our soup.
259
+ container_tag.replace_with(BeautifulSoup(corrected_container, "html.parser"))
260
  logging.info("fine_tune_report: Updated container re-injected.")
261
 
262
  updated_report_html = str(soup)
263
 
264
+ # (Step 5 and Step 6 remain as before to update the reference table and the QA log)
265
+
266
  prompt_refs = (
267
  f"You are a technical editor. Review the following updated report HTML. "
268
  f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
 
283
  next_sibling.replace_with(new_ref_html)
284
  logging.info("fine_tune_report: Reference table updated successfully.")
285
  except Exception as e:
286
+ logging.error("fine_tune_report: Error updating reference table: %s", e)
287
  else:
288
+ logging.info("fine_tune_report: No sibling after reference heading; skipping reference update.")
289
  updated_report_html = str(soup_updated)
290
  else:
291
  logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
292
  else:
293
  logging.info("fine_tune_report: No updated reference table returned; leaving unchanged.")
294
 
 
295
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
296
  updated_qa = qa.strip() + "\n----------\n" + global_summary
297