Guiyom committed on
Commit
ecebfd6
·
verified ·
1 Parent(s): 8fdb5c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -65
app.py CHANGED
@@ -10,6 +10,7 @@ import openai
10
  import PyPDF2
11
  import tempfile
12
  import logging
 
13
  import markdown
14
  import unicodedata
15
  import pdfkit
@@ -88,9 +89,6 @@ Your Answer:"""
88
  updated_history = chat_history + [[user_message, answer]]
89
  return updated_history, ""
90
 
91
- import difflib
92
- from bs4 import BeautifulSoup
93
-
94
  def expand_snippet_area(full_html: str, snippet: str) -> str:
95
  """
96
  Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
@@ -131,7 +129,6 @@ def find_best_matching_snippet(chunk_html: str, report_html: str) -> str:
131
  if similarity > best_similarity:
132
  best_similarity = similarity
133
  best_snippet = str(tag)
134
- # Accept if similarity is reasonably high; threshold can be adjusted.
135
  if best_similarity > 0.6:
136
  return best_snippet
137
  return ""
@@ -165,17 +162,17 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
165
  Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
166
 
167
  Process Overview:
168
- 1. The function submits the full report HTML along with the user’s adjustment request to the LLM.
169
- The prompt instructs the model to output a JSON object containing the minimal unique container(s)
170
- (including their outer HTML—e.g. <iframe>, <div>, or <table>) that correspond to the content that needs
171
- fixing.
172
- 2. The identified container snippet(s) are then located in the report using BeautifulSoup.
173
- 3. For each container, a second LLM call is made to generate a corrected version that integrates the user
174
- instructions while maintaining context, citations, and overall style.
175
- 4. The old container markup is replaced by the corrected version directly in the BeautifulSoup object.
176
- 5. Finally, if new inline citations have been introduced (beyond those in the reference table), a final LLM
177
- call updates the reference table.
178
- 6. A summary of the corrections is appended to the QA log.
179
 
180
  Parameters:
181
  adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
@@ -186,88 +183,83 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
186
  initial_request: The original research query or request.
187
  qa: Existing clarification Q&A log.
188
  target_style: The stylistic guidelines the report should follow.
189
- knowledge_crumbs: Aggregated source/search result content.
190
  complementary_guidance: Additional instructions.
191
 
192
  Returns:
193
  A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
194
  """
195
- import os
196
- import json
197
- import logging
198
- from bs4 import BeautifulSoup
199
-
200
- # Set API keys as environment variables for downstream calls.
201
  os.environ["OPENAI_API_KEY"] = openai_api_key
202
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
203
 
204
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
205
 
206
  # ---------------------------------------------------------------
207
- # Step 1: Identify container snippet(s) needing adjustment.
208
  #
209
- # The prompt instructs the LLM to scan the full report and output a JSON object
210
- # with a key "identified_snippets" that contains complete HTML container elements
211
- # (including their outer tags) that uniquely correspond to the section(s) which
212
- # should be adjusted per the user request.
213
  # ---------------------------------------------------------------
214
  prompt_identify = (
215
  f"You are a meticulous technical editor. Below is the full report HTML together with a "
216
- f"user adjustment request. Identify the minimal, unique container(s) that capture the key content "
217
- f"relevant to the user instruction. The containers may be complete HTML elements such as a <div>, <iframe>, "
218
- f"<table>, etc. Output a JSON object with the key \"identified_snippets\" that maps to a list of these container "
219
- f"HTML snippets ONLY (include the outer tags). No commentary or additional text should be present.\n\n"
220
  f"Full Report HTML:\n{report_html}\n\n"
221
  f"User Adjustment Request:\n{adjustment_request}\n\n"
222
  f"Only output valid JSON."
223
  )
224
 
225
  response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
226
- logging.info(f"fine_tune_report: Raw snippet identification response: {response_identify}")
227
 
228
  try:
229
  response_identify = response_identify.strip().strip("```")
230
  id_data = json.loads(response_identify)
231
- identified_snippets = id_data.get("identified_snippets", [])
232
  except Exception as e:
233
- logging.error(f"fine_tune_report: Error parsing identified snippets JSON: {e}")
234
- identified_snippets = []
235
 
236
- if not identified_snippets:
237
- logging.warning("fine_tune_report: No specific container snippets were identified for adjustment. Returning original report.")
238
  return report_html, qa
239
 
240
  # ---------------------------------------------------------------
241
- # Step 2: For each identified container snippet, find it in the report.
242
  # ---------------------------------------------------------------
243
  soup = BeautifulSoup(report_html, "html.parser")
244
- updated_report_html = report_html
245
  corrections_summary = []
246
-
247
- for snippet in identified_snippets:
248
- snippet = snippet.strip()
249
- # Use BeautifulSoup to search for a tag whose complete outer HTML contains the snippet.
250
- candidate = soup.find(lambda tag: snippet in str(tag))
 
 
 
 
251
  if not candidate:
252
- logging.warning(f"fine_tune_report: The snippet could not be uniquely located in the report:\n{snippet}")
253
  continue
254
 
255
  original_container_html = str(candidate)
256
- logging.info("fine_tune_report: Found container snippet for adjustment.")
257
 
258
  # ---------------------------------------------------------------
259
- # Step 3: Send a prompt to the LLM to correct this container.
260
  #
261
- # Here the LLM is given the entire current container (the extracted full HTML)
262
- # and the full report context (and other guidance) and is asked to produce a corrected
263
- # version that applies the adjustment request.
264
  # ---------------------------------------------------------------
265
  prompt_adjust = (
266
- f"You are a technical editor. Given the following HTML container (with its outer tags) "
267
- f"extracted from a larger report and based on the user adjustment request, produce a corrected "
268
- f"version by making only the changes required. Preserve existing inline citations, formatting, and contextual details. "
269
- f"Ensure the updated content remains consistent with the overall report style. Output your answer as a JSON object "
270
- f"with exactly two keys: \"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
271
  f"Overall Report HTML:\n{report_html}\n\n"
272
  f"Original Container to Adjust:\n{original_container_html}\n\n"
273
  f"User Adjustment Request:\n{adjustment_request}\n\n"
@@ -276,34 +268,34 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
276
  )
277
 
278
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
279
- logging.info(f"fine_tune_report: Raw adjustment response: {response_adjust}")
280
 
281
  try:
282
  response_adjust = response_adjust.strip().strip("```")
283
  adjust_data = json.loads(response_adjust)
284
  corrected_container = adjust_data.get("improved", "").strip()
285
- snippet_summary = adjust_data.get("summary", "").strip()
286
  except Exception as e:
287
- logging.error(f"fine_tune_report: Error parsing snippet adjustment JSON: {e}")
288
  continue
289
 
290
  if not corrected_container:
291
- logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this snippet.")
292
  continue
293
 
294
- corrections_summary.append(f"Container corrected: {snippet_summary}")
295
 
296
  # ---------------------------------------------------------------
297
- # Step 4: Replace the original container in the BeautifulSoup object.
298
  # ---------------------------------------------------------------
299
  candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
300
- logging.info("fine_tune_report: Container snippet replaced.")
301
 
302
  # Get the updated report HTML from the modified soup.
303
  updated_report_html = str(soup)
304
 
305
  # ---------------------------------------------------------------
306
- # Step 5: Update the reference table if any new inline citations exist.
307
  # ---------------------------------------------------------------
308
  prompt_refs = (
309
  f"You are a technical editor. Review the following updated report HTML. "
@@ -316,7 +308,6 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
316
 
317
  if updated_refs:
318
  soup_updated = BeautifulSoup(updated_report_html, "html.parser")
319
- # Look for a heading that includes something like "Reference Summary Table"
320
  ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
321
  if ref_heading:
322
  next_sibling = ref_heading.find_next_sibling()
@@ -328,7 +319,7 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
328
  except Exception as e:
329
  logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
330
  else:
331
- logging.info("fine_tune_report: No sibling element found after reference heading; skipping reference update.")
332
  updated_report_html = str(soup_updated)
333
  else:
334
  logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
 
10
  import PyPDF2
11
  import tempfile
12
  import logging
13
+ import difflib
14
  import markdown
15
  import unicodedata
16
  import pdfkit
 
89
  updated_history = chat_history + [[user_message, answer]]
90
  return updated_history, ""
91
 
 
 
 
92
  def expand_snippet_area(full_html: str, snippet: str) -> str:
93
  """
94
  Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
 
129
  if similarity > best_similarity:
130
  best_similarity = similarity
131
  best_snippet = str(tag)
 
132
  if best_similarity > 0.6:
133
  return best_snippet
134
  return ""
 
162
  Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
163
 
164
  Process Overview:
165
+ 1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
166
+ The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
167
+ (without HTML tags) that uniquely identify the targeted area(s) in the report.
168
+ 2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area) to search for
169
+ the exact text and select the outer container (<div>, <table>, or <iframe>).
170
+ 3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
171
+ and the user adjustment request, and outputs a corrected version.
172
+ 4. The code then replaces the original container with the updated version in the BeautifulSoup object.
173
+ 5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
174
+ the reference table.
175
+ 6. A summary of all corrections is appended to the QA log.
176
 
177
  Parameters:
178
  adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
 
183
  initial_request: The original research query or request.
184
  qa: Existing clarification Q&A log.
185
  target_style: The stylistic guidelines the report should follow.
186
+ knowledge_crumbs: Aggregated source or search result content.
187
  complementary_guidance: Additional instructions.
188
 
189
  Returns:
190
  A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
191
  """
 
 
 
 
 
 
192
  os.environ["OPENAI_API_KEY"] = openai_api_key
193
  os.environ["SERPAPI_API_KEY"] = serpapi_api_key
194
 
195
  logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
196
 
197
  # ---------------------------------------------------------------
198
+ # Step 1: Identify unique string(s) that are representative of the targeted area.
199
  #
200
+ # The prompt now asks the LLM to extract one or more unique plain text strings (without HTML)
201
+ # that appear in the targeted area(s) identified by the user adjustment request. These strings
202
+ # will be used to locate the corresponding container elements.
 
203
  # ---------------------------------------------------------------
204
  prompt_identify = (
205
  f"You are a meticulous technical editor. Below is the full report HTML together with a "
206
+ f"user adjustment request. Identify one or more unique text strings (without any HTML tags or formatting) "
207
+ f"that are representative of the area(s) targeted by the adjustment request. Return these unique strings in a JSON "
208
+ f"object with the key \"identified_unique_strings\" mapped to a list of strings. Ensure that these strings are exact "
209
+ f"as they appear in the report so that they can be used to accurately locate the relevant section(s).\n\n"
210
  f"Full Report HTML:\n{report_html}\n\n"
211
  f"User Adjustment Request:\n{adjustment_request}\n\n"
212
  f"Only output valid JSON."
213
  )
214
 
215
  response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
216
+ logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
217
 
218
  try:
219
  response_identify = response_identify.strip().strip("```")
220
  id_data = json.loads(response_identify)
221
+ unique_strings = id_data.get("identified_unique_strings", [])
222
  except Exception as e:
223
+ logging.error(f"fine_tune_report: Error parsing unique strings JSON: {e}")
224
+ unique_strings = []
225
 
226
+ if not unique_strings:
227
+ logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
228
  return report_html, qa
229
 
230
  # ---------------------------------------------------------------
231
+ # Step 2: For each unique string, locate its corresponding container.
232
  # ---------------------------------------------------------------
233
  soup = BeautifulSoup(report_html, "html.parser")
 
234
  corrections_summary = []
235
+ for uniq_str in unique_strings:
236
+ uniq_str = uniq_str.strip()
237
+ # Use expand_snippet_area to get the full container outer HTML that encloses the unique text.
238
+ container_html = expand_snippet_area(report_html, uniq_str)
239
+ if not container_html:
240
+ logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
241
+ continue
242
+ # Now, search the soup for a tag that includes this container HTML.
243
+ candidate = soup.find(lambda tag: container_html in str(tag))
244
  if not candidate:
245
+ logging.warning(f"fine_tune_report: The container for the unique string was not found in the report:\n{uniq_str}")
246
  continue
247
 
248
  original_container_html = str(candidate)
249
+ logging.info("fine_tune_report: Found container for unique string adjustment.")
250
 
251
  # ---------------------------------------------------------------
252
+ # Step 3: Call the LLM to adjust this container.
253
  #
254
+ # Pass the entire container HTML, the full report context, and the adjustment request.
255
+ # The LLM should output a JSON object with the keys "improved" and "summary".
 
256
  # ---------------------------------------------------------------
257
  prompt_adjust = (
258
+ f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
259
+ f"from a larger report and based on the user adjustment request, produce a corrected version by making "
260
+ f"only the necessary changes. Preserve existing inline citations, formatting, and context. Ensure the updated content "
261
+ f"remains consistent with the overall report style. Output your answer as a JSON object with exactly two keys: "
262
+ f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
263
  f"Overall Report HTML:\n{report_html}\n\n"
264
  f"Original Container to Adjust:\n{original_container_html}\n\n"
265
  f"User Adjustment Request:\n{adjustment_request}\n\n"
 
268
  )
269
 
270
  response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
271
+ logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
272
 
273
  try:
274
  response_adjust = response_adjust.strip().strip("```")
275
  adjust_data = json.loads(response_adjust)
276
  corrected_container = adjust_data.get("improved", "").strip()
277
+ container_summary = adjust_data.get("summary", "").strip()
278
  except Exception as e:
279
+ logging.error(f"fine_tune_report: Error parsing container adjustment JSON: {e}")
280
  continue
281
 
282
  if not corrected_container:
283
+ logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this container.")
284
  continue
285
 
286
+ corrections_summary.append(f"Container corrected: {container_summary}")
287
 
288
  # ---------------------------------------------------------------
289
+ # Step 4: Replace the original container with the corrected container in the BeautifulSoup object.
290
  # ---------------------------------------------------------------
291
  candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
292
+ logging.info("fine_tune_report: Updated container re-injected into the report.")
293
 
294
  # Get the updated report HTML from the modified soup.
295
  updated_report_html = str(soup)
296
 
297
  # ---------------------------------------------------------------
298
+ # Step 5: (Optional) Update the reference table if new inline citations exist.
299
  # ---------------------------------------------------------------
300
  prompt_refs = (
301
  f"You are a technical editor. Review the following updated report HTML. "
 
308
 
309
  if updated_refs:
310
  soup_updated = BeautifulSoup(updated_report_html, "html.parser")
 
311
  ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
312
  if ref_heading:
313
  next_sibling = ref_heading.find_next_sibling()
 
319
  except Exception as e:
320
  logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
321
  else:
322
+ logging.info("fine_tune_report: No sibling element found after reference table heading; skipping reference update.")
323
  updated_report_html = str(soup_updated)
324
  else:
325
  logging.info("fine_tune_report: No reference table heading found; reference update skipped.")