Guiyom committed on
Commit
7835b45
·
verified ·
1 Parent(s): 7e262e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -75
app.py CHANGED
@@ -88,10 +88,25 @@ Your Answer:"""
88
  updated_history = chat_history + [[user_message, answer]]
89
  return updated_history, ""
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def expand_snippet_area(full_html: str, snippet: str) -> str:
93
  """
94
- Given the full HTML and a small snippet (e.g., containing a keyword),
95
  find the element in which that snippet appears and traverse upward through the DOM
96
  until a larger container is reached. Allowed container tags include div, table, iframe, and section.
97
  The traversal stops when the parent is no longer in the allowed list or before reaching <body>.
@@ -162,51 +177,24 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
162
  Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
163
 
164
  Process Overview:
165
- 1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
166
- The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
167
- (without HTML tags) that uniquely identify the targeted area(s) in the report.
168
- 2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area) to search for
169
- the exact text and select the outer container (<div>, <table>, or <iframe>).
170
- 3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
171
- and the user adjustment request, and outputs a corrected version.
172
- 4. The code then replaces the original container with the updated version in the BeautifulSoup object.
173
- 5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
174
- the reference table.
175
- 6. A summary of all corrections is appended to the QA log.
176
-
177
- Parameters:
178
- adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
179
- "the introduction should be more detailed, adjust it" etc.
180
- openai_api_key: OpenAI API Key.
181
- serpapi_api_key: SERPAPI API Key.
182
- report_html: A string containing the full HTML report.
183
- initial_request: The original research query or request.
184
- qa: Existing clarification Q&A log.
185
- target_style: The stylistic guidelines the report should follow.
186
- knowledge_crumbs: Aggregated source or search result content.
187
- complementary_guidance: Additional instructions.
188
-
189
- Returns:
190
- A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
191
  """
192
  os.environ["OPENAI_API_KEY"] = openai_api_key
193
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
194
 
195
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
196
 
197
- # ---------------------------------------------------------------
198
- # Step 1: Identify unique string(s) that are representative of the targeted area.
199
- #
200
- # The prompt now asks the LLM to extract one or more unique plain text strings (without HTML)
201
- # that appear in the targeted area(s) identified by the user adjustment request. These strings
202
- # will be used to locate the corresponding container elements.
203
- # ---------------------------------------------------------------
204
  prompt_identify = (
205
  f"You are a meticulous technical editor. Below is the full report HTML together with a "
206
- f"user adjustment request. Identify one or more unique text strings (without any HTML tags or formatting) "
207
- f"that are representative of the area(s) targeted by the adjustment request. Return these unique strings in a JSON "
208
- f"object with the key \"identified_unique_strings\" mapped to a list of strings. Ensure that these strings are exact "
209
- f"as they appear in the report so that they can be used to accurately locate the relevant section(s).\n\n"
210
  f"Full Report HTML:\n{report_html}\n\n"
211
  f"User Adjustment Request:\n{adjustment_request}\n\n"
212
  f"Only output valid JSON."
@@ -216,7 +204,7 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
216
  logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
217
 
218
  try:
219
- response_identify = response_identify.strip().strip("```")
220
  id_data = json.loads(response_identify)
221
  unique_strings = id_data.get("identified_unique_strings", [])
222
  except Exception as e:
@@ -227,39 +215,30 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
227
  logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
228
  return report_html, qa
229
 
230
- # ---------------------------------------------------------------
231
- # Step 2: For each unique string, locate its corresponding container.
232
- # ---------------------------------------------------------------
233
  soup = BeautifulSoup(report_html, "html.parser")
234
  corrections_summary = []
235
  for uniq_str in unique_strings:
236
  uniq_str = uniq_str.strip()
237
- # Use expand_snippet_area to get the full container outer HTML that encloses the unique text.
238
  container_html = expand_snippet_area(report_html, uniq_str)
239
  if not container_html:
240
  logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
241
  continue
242
- # Now, search the soup for a tag that includes this container HTML.
243
  candidate = soup.find(lambda tag: container_html in str(tag))
244
  if not candidate:
245
- logging.warning(f"fine_tune_report: The container for the unique string was not found in the report:\n{uniq_str}")
246
  continue
247
 
248
  original_container_html = str(candidate)
249
  logging.info("fine_tune_report: Found container for unique string adjustment.")
250
 
251
- # ---------------------------------------------------------------
252
  # Step 3: Call the LLM to adjust this container.
253
- #
254
- # Pass the entire container HTML, the full report context, and the adjustment request.
255
- # The LLM should output a JSON object with the keys "improved" and "summary".
256
- # ---------------------------------------------------------------
257
  prompt_adjust = (
258
  f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
259
  f"from a larger report and based on the user adjustment request, produce a corrected version by making "
260
- f"only the necessary changes. Preserve existing inline citations, formatting, and context. Ensure the updated content "
261
- f"remains consistent with the overall report style. Output your answer as a JSON object with exactly two keys: "
262
- f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
263
  f"Overall Report HTML:\n{report_html}\n\n"
264
  f"Original Container to Adjust:\n{original_container_html}\n\n"
265
  f"User Adjustment Request:\n{adjustment_request}\n\n"
@@ -269,9 +248,8 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
269
 
270
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
271
  logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
272
-
273
  try:
274
- response_adjust = response_adjust.strip().strip("```")
275
  adjust_data = json.loads(response_adjust)
276
  corrected_container = adjust_data.get("improved", "").strip()
277
  container_summary = adjust_data.get("summary", "").strip()
@@ -280,31 +258,25 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
280
  continue
281
 
282
  if not corrected_container:
283
- logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this container.")
284
  continue
285
 
286
  corrections_summary.append(f"Container corrected: {container_summary}")
287
-
288
- # ---------------------------------------------------------------
289
- # Step 4: Replace the original container with the corrected container in the BeautifulSoup object.
290
- # ---------------------------------------------------------------
291
  candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
292
- logging.info("fine_tune_report: Updated container re-injected into the report.")
293
 
294
- # Get the updated report HTML from the modified soup.
295
  updated_report_html = str(soup)
296
 
297
- # ---------------------------------------------------------------
298
- # Step 5: (Optional) Update the reference table if new inline citations exist.
299
- # ---------------------------------------------------------------
300
  prompt_refs = (
301
  f"You are a technical editor. Review the following updated report HTML. "
302
- f"If there are any new inline citations (e.g., [x]) not present in the original reference table, "
303
- f"generate an updated Reference Summary Table as valid HTML containing all references. Output only the HTML code for the updated reference table with no commentary.\n\n"
304
  f"Updated Report HTML:\n{updated_report_html}"
305
  )
306
  updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
307
- updated_refs = updated_refs.strip().strip("```")
308
 
309
  if updated_refs:
310
  soup_updated = BeautifulSoup(updated_report_html, "html.parser")
@@ -317,22 +289,20 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
317
  next_sibling.replace_with(new_ref_html)
318
  logging.info("fine_tune_report: Reference table updated successfully.")
319
  except Exception as e:
320
- logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
321
  else:
322
- logging.info("fine_tune_report: No sibling element found after reference table heading; skipping reference update.")
323
  updated_report_html = str(soup_updated)
324
  else:
325
- logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
326
  else:
327
  logging.info("fine_tune_report: No updated reference table returned; leaving references unchanged.")
328
 
329
- # ---------------------------------------------------------------
330
- # Step 6: Append a summary of corrections to the existing QA log.
331
- # ---------------------------------------------------------------
332
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
333
  updated_qa = qa.strip() + "\n----------\n" + global_summary
334
 
335
- logging.info("fine_tune_report: Fine-tuning process completed.")
336
  return updated_report_html, updated_qa
337
 
338
  def suggest_improvements(report_html: str, openai_api_key: str, serpapi_api_key: str) -> str:
 
88
  updated_history = chat_history + [[user_message, answer]]
89
  return updated_history, ""
90
 
91
def clean_llm_response(response: str) -> str:
    """
    Normalize a raw LLM response so it can be parsed (e.g. with json.loads).

    The response is trimmed, a surrounding Markdown code fence is removed
    (including an optional language tag such as ``json`` on the opening
    fence), and every run of whitespace, newlines included, is collapsed to
    a single space.

    Parameters:
        response: The raw text returned by the LLM. An empty or None value
            is tolerated.

    Returns:
        The cleaned single-line string, or "" when there is nothing to clean.
    """
    # Guard against empty / missing responses instead of failing on .strip().
    if not response:
        return ""

    cleaned = response.strip()

    # Remove the closing fence first so a lone "```" is not mistaken for an
    # opening fence followed by content.
    cleaned = re.sub(r"`{3,}\Z", "", cleaned)

    # Remove the opening fence. str.lstrip("```") only strips backtick
    # characters and would leave a language tag ("json", "html", ...) glued to
    # the payload, which breaks json.loads. The tag is dropped only when it is
    # followed by whitespace, so un-tagged content right after the fence
    # (e.g. "```true```") is preserved.
    cleaned = re.sub(r"\A`{3,}(?:[A-Za-z0-9_+-]+(?=\s))?", "", cleaned)

    # Replace newlines and collapse repeated whitespace into single spaces.
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()
106
 
107
  def expand_snippet_area(full_html: str, snippet: str) -> str:
108
  """
109
+ Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
110
  find the element in which that snippet appears and traverse upward through the DOM
111
  until a larger container is reached. Allowed container tags include div, table, iframe, and section.
112
  The traversal stops when the parent is no longer in the allowed list or before reaching <body>.
 
177
  Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
178
 
179
  Process Overview:
180
+ 1. Submits full report HTML and the user adjustment request to the LLM and gets back one or more unique plain text strings.
181
+ 2. For each unique string, uses BeautifulSoup (and expand_snippet_area) to retrieve the outer container (<div>, <table>, or <iframe>).
182
+ 3. For each container, calls the LLM (with full report context, crumbs, and adjustment request) to output a corrected version.
183
+ 4. Replaces the original container with the corrected version.
184
+ 5. Optionally updates the reference table if new inline citations appear.
185
+ 6. Appends a corrections summary to the QA log.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  """
187
  os.environ["OPENAI_API_KEY"] = openai_api_key
188
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
189
 
190
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
191
 
192
+ # Step 1: Identify unique plain text string(s) that pinpoint the targeted area.
 
 
 
 
 
 
193
  prompt_identify = (
194
  f"You are a meticulous technical editor. Below is the full report HTML together with a "
195
+ f"user adjustment request. Extract one or more unique plain-text string(s) (without any HTML tags or formatting) "
196
+ f"that uniquely appear in the area targeted by the adjustment request. Output them in a JSON object with the key "
197
+ f"\"identified_unique_strings\" mapped to a list of strings.\n\n"
 
198
  f"Full Report HTML:\n{report_html}\n\n"
199
  f"User Adjustment Request:\n{adjustment_request}\n\n"
200
  f"Only output valid JSON."
 
204
  logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
205
 
206
  try:
207
+ response_identify = clean_llm_response(response_identify)
208
  id_data = json.loads(response_identify)
209
  unique_strings = id_data.get("identified_unique_strings", [])
210
  except Exception as e:
 
215
  logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
216
  return report_html, qa
217
 
218
+ # Step 2: For each unique string, locate the corresponding outer container.
 
 
219
  soup = BeautifulSoup(report_html, "html.parser")
220
  corrections_summary = []
221
  for uniq_str in unique_strings:
222
  uniq_str = uniq_str.strip()
 
223
  container_html = expand_snippet_area(report_html, uniq_str)
224
  if not container_html:
225
  logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
226
  continue
 
227
  candidate = soup.find(lambda tag: container_html in str(tag))
228
  if not candidate:
229
+ logging.warning(f"fine_tune_report: The container for unique string was not found: {uniq_str}")
230
  continue
231
 
232
  original_container_html = str(candidate)
233
  logging.info("fine_tune_report: Found container for unique string adjustment.")
234
 
 
235
  # Step 3: Call the LLM to adjust this container.
 
 
 
 
236
  prompt_adjust = (
237
  f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
238
  f"from a larger report and based on the user adjustment request, produce a corrected version by making "
239
+ f"only the necessary changes. Preserve inline citations, formatting, and context. Ensure the updated content "
240
+ f"remains consistent with the overall report style. Output a JSON object with two keys: "
241
+ f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes).\n\n"
242
  f"Overall Report HTML:\n{report_html}\n\n"
243
  f"Original Container to Adjust:\n{original_container_html}\n\n"
244
  f"User Adjustment Request:\n{adjustment_request}\n\n"
 
248
 
249
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
250
  logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
 
251
  try:
252
+ response_adjust = clean_llm_response(response_adjust)
253
  adjust_data = json.loads(response_adjust)
254
  corrected_container = adjust_data.get("improved", "").strip()
255
  container_summary = adjust_data.get("summary", "").strip()
 
258
  continue
259
 
260
  if not corrected_container:
261
+ logging.warning("fine_tune_report: No improved container was generated; skipping this container.")
262
  continue
263
 
264
  corrections_summary.append(f"Container corrected: {container_summary}")
265
+ # Step 4: Replace the original container with the updated container.
 
 
 
266
  candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
267
+ logging.info("fine_tune_report: Updated container re-injected.")
268
 
 
269
  updated_report_html = str(soup)
270
 
271
+ # Step 5: (Optional) Update reference table if needed.
 
 
272
  prompt_refs = (
273
  f"You are a technical editor. Review the following updated report HTML. "
274
+ f"If any new inline citations (e.g., [x]) have been introduced that are not in the original reference table, "
275
+ f"generate an updated Reference Summary Table as valid HTML. Output only the updated reference table HTML with no commentary.\n\n"
276
  f"Updated Report HTML:\n{updated_report_html}"
277
  )
278
  updated_refs = openai_call(prompt=prompt_refs, model="o3-mini", max_tokens_param=1000, temperature=0.5)
279
+ updated_refs = clean_llm_response(updated_refs)
280
 
281
  if updated_refs:
282
  soup_updated = BeautifulSoup(updated_report_html, "html.parser")
 
289
  next_sibling.replace_with(new_ref_html)
290
  logging.info("fine_tune_report: Reference table updated successfully.")
291
  except Exception as e:
292
+ logging.error(f"fine_tune_report: Error updating reference table: {e}")
293
  else:
294
+ logging.info("fine_tune_report: No sibling after reference heading; skipping update.")
295
  updated_report_html = str(soup_updated)
296
  else:
297
+ logging.info("fine_tune_report: No reference table heading found; skipping reference update.")
298
  else:
299
  logging.info("fine_tune_report: No updated reference table returned; leaving references unchanged.")
300
 
301
+ # Step 6: Append corrections summary to the QA log.
 
 
302
  global_summary = "Corrections Applied Based on User Request:\n" + "\n".join(corrections_summary)
303
  updated_qa = qa.strip() + "\n----------\n" + global_summary
304
 
305
+ logging.info("fine_tune_report: Fine-tuning complete.")
306
  return updated_report_html, updated_qa
307
 
308
  def suggest_improvements(report_html: str, openai_api_key: str, serpapi_api_key: str) -> str: