Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ import openai
|
|
| 10 |
import PyPDF2
|
| 11 |
import tempfile
|
| 12 |
import logging
|
|
|
|
| 13 |
import markdown
|
| 14 |
import unicodedata
|
| 15 |
import pdfkit
|
|
@@ -88,9 +89,6 @@ Your Answer:"""
|
|
| 88 |
updated_history = chat_history + [[user_message, answer]]
|
| 89 |
return updated_history, ""
|
| 90 |
|
| 91 |
-
import difflib
|
| 92 |
-
from bs4 import BeautifulSoup
|
| 93 |
-
|
| 94 |
def expand_snippet_area(full_html: str, snippet: str) -> str:
|
| 95 |
"""
|
| 96 |
Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
|
|
@@ -131,7 +129,6 @@ def find_best_matching_snippet(chunk_html: str, report_html: str) -> str:
|
|
| 131 |
if similarity > best_similarity:
|
| 132 |
best_similarity = similarity
|
| 133 |
best_snippet = str(tag)
|
| 134 |
-
# Accept if similarity is reasonably high; threshold can be adjusted.
|
| 135 |
if best_similarity > 0.6:
|
| 136 |
return best_snippet
|
| 137 |
return ""
|
|
@@ -165,17 +162,17 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
|
|
| 165 |
Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
|
| 166 |
|
| 167 |
Process Overview:
|
| 168 |
-
1. The function submits the full report HTML
|
| 169 |
-
The prompt instructs the model to output a JSON object containing
|
| 170 |
-
(
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
3. For each container, a second LLM call is made
|
| 174 |
-
|
| 175 |
-
4. The
|
| 176 |
-
5.
|
| 177 |
-
|
| 178 |
-
6. A summary of
|
| 179 |
|
| 180 |
Parameters:
|
| 181 |
adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
|
|
@@ -186,88 +183,83 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
|
|
| 186 |
initial_request: The original research query or request.
|
| 187 |
qa: Existing clarification Q&A log.
|
| 188 |
target_style: The stylistic guidelines the report should follow.
|
| 189 |
-
knowledge_crumbs: Aggregated source
|
| 190 |
complementary_guidance: Additional instructions.
|
| 191 |
|
| 192 |
Returns:
|
| 193 |
A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
|
| 194 |
"""
|
| 195 |
-
import os
|
| 196 |
-
import json
|
| 197 |
-
import logging
|
| 198 |
-
from bs4 import BeautifulSoup
|
| 199 |
-
|
| 200 |
-
# Set API keys as environment variables for downstream calls.
|
| 201 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 202 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 203 |
|
| 204 |
logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
|
| 205 |
|
| 206 |
# ---------------------------------------------------------------
|
| 207 |
-
# Step 1: Identify
|
| 208 |
#
|
| 209 |
-
# The prompt
|
| 210 |
-
#
|
| 211 |
-
#
|
| 212 |
-
# should be adjusted per the user request.
|
| 213 |
# ---------------------------------------------------------------
|
| 214 |
prompt_identify = (
|
| 215 |
f"You are a meticulous technical editor. Below is the full report HTML together with a "
|
| 216 |
-
f"user adjustment request. Identify
|
| 217 |
-
f"
|
| 218 |
-
f"
|
| 219 |
-
f"
|
| 220 |
f"Full Report HTML:\n{report_html}\n\n"
|
| 221 |
f"User Adjustment Request:\n{adjustment_request}\n\n"
|
| 222 |
f"Only output valid JSON."
|
| 223 |
)
|
| 224 |
|
| 225 |
response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
|
| 226 |
-
logging.info(f"fine_tune_report: Raw
|
| 227 |
|
| 228 |
try:
|
| 229 |
response_identify = response_identify.strip().strip("```")
|
| 230 |
id_data = json.loads(response_identify)
|
| 231 |
-
|
| 232 |
except Exception as e:
|
| 233 |
-
logging.error(f"fine_tune_report: Error parsing
|
| 234 |
-
|
| 235 |
|
| 236 |
-
if not
|
| 237 |
-
logging.warning("fine_tune_report: No
|
| 238 |
return report_html, qa
|
| 239 |
|
| 240 |
# ---------------------------------------------------------------
|
| 241 |
-
# Step 2: For each
|
| 242 |
# ---------------------------------------------------------------
|
| 243 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 244 |
-
updated_report_html = report_html
|
| 245 |
corrections_summary = []
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
if not candidate:
|
| 252 |
-
logging.warning(f"fine_tune_report: The
|
| 253 |
continue
|
| 254 |
|
| 255 |
original_container_html = str(candidate)
|
| 256 |
-
logging.info("fine_tune_report: Found container
|
| 257 |
|
| 258 |
# ---------------------------------------------------------------
|
| 259 |
-
# Step 3:
|
| 260 |
#
|
| 261 |
-
#
|
| 262 |
-
#
|
| 263 |
-
# version that applies the adjustment request.
|
| 264 |
# ---------------------------------------------------------------
|
| 265 |
prompt_adjust = (
|
| 266 |
-
f"You are a technical editor. Given the following HTML container (with its outer tags) "
|
| 267 |
-
f"
|
| 268 |
-
f"
|
| 269 |
-
f"
|
| 270 |
-
f"
|
| 271 |
f"Overall Report HTML:\n{report_html}\n\n"
|
| 272 |
f"Original Container to Adjust:\n{original_container_html}\n\n"
|
| 273 |
f"User Adjustment Request:\n{adjustment_request}\n\n"
|
|
@@ -276,34 +268,34 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
|
|
| 276 |
)
|
| 277 |
|
| 278 |
response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
|
| 279 |
-
logging.info(f"fine_tune_report: Raw adjustment response: {response_adjust}")
|
| 280 |
|
| 281 |
try:
|
| 282 |
response_adjust = response_adjust.strip().strip("```")
|
| 283 |
adjust_data = json.loads(response_adjust)
|
| 284 |
corrected_container = adjust_data.get("improved", "").strip()
|
| 285 |
-
|
| 286 |
except Exception as e:
|
| 287 |
-
logging.error(f"fine_tune_report: Error parsing
|
| 288 |
continue
|
| 289 |
|
| 290 |
if not corrected_container:
|
| 291 |
-
logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this
|
| 292 |
continue
|
| 293 |
|
| 294 |
-
corrections_summary.append(f"Container corrected: {
|
| 295 |
|
| 296 |
# ---------------------------------------------------------------
|
| 297 |
-
# Step 4: Replace the original container in the BeautifulSoup object.
|
| 298 |
# ---------------------------------------------------------------
|
| 299 |
candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
|
| 300 |
-
logging.info("fine_tune_report:
|
| 301 |
|
| 302 |
# Get the updated report HTML from the modified soup.
|
| 303 |
updated_report_html = str(soup)
|
| 304 |
|
| 305 |
# ---------------------------------------------------------------
|
| 306 |
-
# Step 5: Update the reference table if
|
| 307 |
# ---------------------------------------------------------------
|
| 308 |
prompt_refs = (
|
| 309 |
f"You are a technical editor. Review the following updated report HTML. "
|
|
@@ -316,7 +308,6 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
|
|
| 316 |
|
| 317 |
if updated_refs:
|
| 318 |
soup_updated = BeautifulSoup(updated_report_html, "html.parser")
|
| 319 |
-
# Look for a heading that includes something like "Reference Summary Table"
|
| 320 |
ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
|
| 321 |
if ref_heading:
|
| 322 |
next_sibling = ref_heading.find_next_sibling()
|
|
@@ -328,7 +319,7 @@ def fine_tune_report(adjustment_request: str, openai_api_key: str, serpapi_api_k
|
|
| 328 |
except Exception as e:
|
| 329 |
logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
|
| 330 |
else:
|
| 331 |
-
logging.info("fine_tune_report: No sibling element found after reference heading; skipping reference update.")
|
| 332 |
updated_report_html = str(soup_updated)
|
| 333 |
else:
|
| 334 |
logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
|
|
|
|
| 10 |
import PyPDF2
|
| 11 |
import tempfile
|
| 12 |
import logging
|
| 13 |
+
import difflib
|
| 14 |
import markdown
|
| 15 |
import unicodedata
|
| 16 |
import pdfkit
|
|
|
|
| 89 |
updated_history = chat_history + [[user_message, answer]]
|
| 90 |
return updated_history, ""
|
| 91 |
|
|
|
|
|
|
|
|
|
|
| 92 |
def expand_snippet_area(full_html: str, snippet: str) -> str:
|
| 93 |
"""
|
| 94 |
Given the full HTML and a small snippet (e.g., containing a keyword such as "abc"),
|
|
|
|
| 129 |
if similarity > best_similarity:
|
| 130 |
best_similarity = similarity
|
| 131 |
best_snippet = str(tag)
|
|
|
|
| 132 |
if best_similarity > 0.6:
|
| 133 |
return best_snippet
|
| 134 |
return ""
|
|
|
|
| 162 |
Fine-tunes an HTML report based on a user’s correction request by processing complete container elements.
|
| 163 |
|
| 164 |
Process Overview:
|
| 165 |
+
1. The function submits the full report HTML together with the user’s adjustment request to the LLM.
|
| 166 |
+
The prompt instructs the model to output a JSON object containing one or more unique plain text string(s)
|
| 167 |
+
(without HTML tags) that uniquely identify the targeted area(s) in the report.
|
| 168 |
+
2. For each returned unique string, the algorithm uses BeautifulSoup (and expand_snippet_area) to search for
|
| 169 |
+
the exact text and select the outer container (<div>, <table>, or <iframe>).
|
| 170 |
+
3. For each container, a second LLM call is made that takes in the container’s full HTML, the full report context,
|
| 171 |
+
and the user adjustment request, and outputs a corrected version.
|
| 172 |
+
4. The code then replaces the original container with the updated version in the BeautifulSoup object.
|
| 173 |
+
5. If new inline citations have been introduced (beyond those in the reference table), a final LLM call updates
|
| 174 |
+
the reference table.
|
| 175 |
+
6. A summary of all corrections is appended to the QA log.
|
| 176 |
|
| 177 |
Parameters:
|
| 178 |
adjustment_request: A string such as "the visual after 'xyz' is not displaying properly, please fix it" or
|
|
|
|
| 183 |
initial_request: The original research query or request.
|
| 184 |
qa: Existing clarification Q&A log.
|
| 185 |
target_style: The stylistic guidelines the report should follow.
|
| 186 |
+
knowledge_crumbs: Aggregated source or search result content.
|
| 187 |
complementary_guidance: Additional instructions.
|
| 188 |
|
| 189 |
Returns:
|
| 190 |
A tuple (updated_report_html, updated_qa) with the corrected report and updated QA log.
|
| 191 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 193 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 194 |
|
| 195 |
logging.info("fine_tune_report: Starting fine-tuning process based on the adjustment request.")
|
| 196 |
|
| 197 |
# ---------------------------------------------------------------
|
| 198 |
+
# Step 1: Identify unique string(s) that are representative of the targeted area.
|
| 199 |
#
|
| 200 |
+
# The prompt now asks the LLM to extract one or more unique plain text strings (without HTML)
|
| 201 |
+
# that appear in the targeted area(s) identified by the user adjustment request. These strings
|
| 202 |
+
# will be used to locate the corresponding container elements.
|
|
|
|
| 203 |
# ---------------------------------------------------------------
|
| 204 |
prompt_identify = (
|
| 205 |
f"You are a meticulous technical editor. Below is the full report HTML together with a "
|
| 206 |
+
f"user adjustment request. Identify one or more unique text strings (without any HTML tags or formatting) "
|
| 207 |
+
f"that are representative of the area(s) targeted by the adjustment request. Return these unique strings in a JSON "
|
| 208 |
+
f"object with the key \"identified_unique_strings\" mapped to a list of strings. Ensure that these strings are exact "
|
| 209 |
+
f"as they appear in the report so that they can be used to accurately locate the relevant section(s).\n\n"
|
| 210 |
f"Full Report HTML:\n{report_html}\n\n"
|
| 211 |
f"User Adjustment Request:\n{adjustment_request}\n\n"
|
| 212 |
f"Only output valid JSON."
|
| 213 |
)
|
| 214 |
|
| 215 |
response_identify = openai_call(prompt=prompt_identify, model="o3-mini", max_tokens_param=1500, temperature=0)
|
| 216 |
+
logging.info(f"fine_tune_report: Raw unique string identification response: {response_identify}")
|
| 217 |
|
| 218 |
try:
|
| 219 |
response_identify = response_identify.strip().strip("```")
|
| 220 |
id_data = json.loads(response_identify)
|
| 221 |
+
unique_strings = id_data.get("identified_unique_strings", [])
|
| 222 |
except Exception as e:
|
| 223 |
+
logging.error(f"fine_tune_report: Error parsing unique strings JSON: {e}")
|
| 224 |
+
unique_strings = []
|
| 225 |
|
| 226 |
+
if not unique_strings:
|
| 227 |
+
logging.warning("fine_tune_report: No unique strings were identified for adjustment. Returning original report.")
|
| 228 |
return report_html, qa
|
| 229 |
|
| 230 |
# ---------------------------------------------------------------
|
| 231 |
+
# Step 2: For each unique string, locate its corresponding container.
|
| 232 |
# ---------------------------------------------------------------
|
| 233 |
soup = BeautifulSoup(report_html, "html.parser")
|
|
|
|
| 234 |
corrections_summary = []
|
| 235 |
+
for uniq_str in unique_strings:
|
| 236 |
+
uniq_str = uniq_str.strip()
|
| 237 |
+
# Use expand_snippet_area to get the full container outer HTML that encloses the unique text.
|
| 238 |
+
container_html = expand_snippet_area(report_html, uniq_str)
|
| 239 |
+
if not container_html:
|
| 240 |
+
logging.warning(f"fine_tune_report: Could not locate a container for unique string: {uniq_str}")
|
| 241 |
+
continue
|
| 242 |
+
# Now, search the soup for a tag that includes this container HTML.
|
| 243 |
+
candidate = soup.find(lambda tag: container_html in str(tag))
|
| 244 |
if not candidate:
|
| 245 |
+
logging.warning(f"fine_tune_report: The container for the unique string was not found in the report:\n{uniq_str}")
|
| 246 |
continue
|
| 247 |
|
| 248 |
original_container_html = str(candidate)
|
| 249 |
+
logging.info("fine_tune_report: Found container for unique string adjustment.")
|
| 250 |
|
| 251 |
# ---------------------------------------------------------------
|
| 252 |
+
# Step 3: Call the LLM to adjust this container.
|
| 253 |
#
|
| 254 |
+
# Pass the entire container HTML, the full report context, and the adjustment request.
|
| 255 |
+
# The LLM should output a JSON object with the keys "improved" and "summary".
|
|
|
|
| 256 |
# ---------------------------------------------------------------
|
| 257 |
prompt_adjust = (
|
| 258 |
+
f"You are a technical editor. Given the following HTML container (with its outer tags) extracted "
|
| 259 |
+
f"from a larger report and based on the user adjustment request, produce a corrected version by making "
|
| 260 |
+
f"only the necessary changes. Preserve existing inline citations, formatting, and context. Ensure the updated content "
|
| 261 |
+
f"remains consistent with the overall report style. Output your answer as a JSON object with exactly two keys: "
|
| 262 |
+
f"\"improved\" (the corrected container's full HTML) and \"summary\" (a brief explanation of the changes applied).\n\n"
|
| 263 |
f"Overall Report HTML:\n{report_html}\n\n"
|
| 264 |
f"Original Container to Adjust:\n{original_container_html}\n\n"
|
| 265 |
f"User Adjustment Request:\n{adjustment_request}\n\n"
|
|
|
|
| 268 |
)
|
| 269 |
|
| 270 |
response_adjust = openai_call(prompt=prompt_adjust, model="o3-mini", max_tokens_param=2000, temperature=0.0)
|
| 271 |
+
logging.info(f"fine_tune_report: Raw container adjustment response: {response_adjust}")
|
| 272 |
|
| 273 |
try:
|
| 274 |
response_adjust = response_adjust.strip().strip("```")
|
| 275 |
adjust_data = json.loads(response_adjust)
|
| 276 |
corrected_container = adjust_data.get("improved", "").strip()
|
| 277 |
+
container_summary = adjust_data.get("summary", "").strip()
|
| 278 |
except Exception as e:
|
| 279 |
+
logging.error(f"fine_tune_report: Error parsing container adjustment JSON: {e}")
|
| 280 |
continue
|
| 281 |
|
| 282 |
if not corrected_container:
|
| 283 |
+
logging.warning("fine_tune_report: No improved container was returned by the LLM; skipping this container.")
|
| 284 |
continue
|
| 285 |
|
| 286 |
+
corrections_summary.append(f"Container corrected: {container_summary}")
|
| 287 |
|
| 288 |
# ---------------------------------------------------------------
|
| 289 |
+
# Step 4: Replace the original container with the corrected container in the BeautifulSoup object.
|
| 290 |
# ---------------------------------------------------------------
|
| 291 |
candidate.replace_with(BeautifulSoup(corrected_container, "html.parser"))
|
| 292 |
+
logging.info("fine_tune_report: Updated container re-injected into the report.")
|
| 293 |
|
| 294 |
# Get the updated report HTML from the modified soup.
|
| 295 |
updated_report_html = str(soup)
|
| 296 |
|
| 297 |
# ---------------------------------------------------------------
|
| 298 |
+
# Step 5: (Optional) Update the reference table if new inline citations exist.
|
| 299 |
# ---------------------------------------------------------------
|
| 300 |
prompt_refs = (
|
| 301 |
f"You are a technical editor. Review the following updated report HTML. "
|
|
|
|
| 308 |
|
| 309 |
if updated_refs:
|
| 310 |
soup_updated = BeautifulSoup(updated_report_html, "html.parser")
|
|
|
|
| 311 |
ref_heading = soup_updated.find(lambda tag: tag.name in ["h1", "h2", "h3", "h4"] and "Reference Summary Table" in tag.get_text())
|
| 312 |
if ref_heading:
|
| 313 |
next_sibling = ref_heading.find_next_sibling()
|
|
|
|
| 319 |
except Exception as e:
|
| 320 |
logging.error(f"fine_tune_report: Error replacing the reference table: {e}")
|
| 321 |
else:
|
| 322 |
+
logging.info("fine_tune_report: No sibling element found after reference table heading; skipping reference update.")
|
| 323 |
updated_report_html = str(soup_updated)
|
| 324 |
else:
|
| 325 |
logging.info("fine_tune_report: No reference table heading found; reference update skipped.")
|