Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -86,6 +86,28 @@ def send_chat_message(user_message, chat_history, report_text, crumbs_text):
|
|
| 86 |
updated_history = chat_history + [[user_message, answer]]
|
| 87 |
return updated_history, ""
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
|
| 90 |
initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
|
| 91 |
complementary_guidance: str) -> (str, str):
|
|
@@ -93,12 +115,16 @@ def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api
|
|
| 93 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 94 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 95 |
|
| 96 |
-
# Parse the
|
| 97 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 98 |
|
| 99 |
-
# --- Specific adjustment
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
if adjustmentguidelines.strip():
|
| 101 |
-
extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below, extract only the precise HTML snippet(s) (including
|
| 102 |
|
| 103 |
User Instruction: "{adjustmentguidelines}"
|
| 104 |
|
|
@@ -106,7 +132,7 @@ Report HTML:
|
|
| 106 |
{report_html}
|
| 107 |
|
| 108 |
Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
|
| 109 |
-
Do not
|
| 110 |
"""
|
| 111 |
extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
|
| 112 |
try:
|
|
@@ -117,11 +143,14 @@ Do not output any extra commentary or markdown formatting.
|
|
| 117 |
logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
|
| 118 |
identified_snippets = []
|
| 119 |
if identified_snippets:
|
| 120 |
-
|
| 121 |
-
#
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
| 125 |
else:
|
| 126 |
logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
|
| 127 |
all_chunks = []
|
|
@@ -132,15 +161,16 @@ Do not output any extra commentary or markdown formatting.
|
|
| 132 |
all_guidelines = []
|
| 133 |
all_token_sizes = []
|
| 134 |
|
| 135 |
-
# --- Fallback
|
| 136 |
if not all_chunks:
|
| 137 |
designated_chunks = soup.find_all("div", class_="improvable-chunk")
|
| 138 |
-
global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability
|
|
|
|
| 139 |
|
| 140 |
-
Please provide
|
| 141 |
|
| 142 |
"identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
|
| 143 |
-
"chunk_adjustment_guidelines": A list of guideline strings (each with bullet points)
|
| 144 |
"chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.
|
| 145 |
|
| 146 |
Report HTML:
|
|
@@ -174,7 +204,6 @@ Knowledge Crumbs (search results):
|
|
| 174 |
chunk_adjustment_guidelines_from_llm = []
|
| 175 |
chunk_token_sizes_from_llm = []
|
| 176 |
|
| 177 |
-
# Process designated chunks (if any)
|
| 178 |
designated_chunks_html = []
|
| 179 |
designated_guidelines = []
|
| 180 |
designated_token_sizes = []
|
|
@@ -184,8 +213,8 @@ Knowledge Crumbs (search results):
|
|
| 184 |
designated_prompt = f"""Given the following report chunk:
|
| 185 |
{chunk_html}
|
| 186 |
|
| 187 |
-
Generate a JSON object with exactly two
|
| 188 |
-
"guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring
|
| 189 |
"token_size": An integer representing the recommended token size for processing this chunk.
|
| 190 |
"""
|
| 191 |
try:
|
|
@@ -200,7 +229,6 @@ Generate a JSON object with exactly two fields (no extra commentary):
|
|
| 200 |
designated_guidelines.append("")
|
| 201 |
designated_token_sizes.append(1000)
|
| 202 |
designated_chunks_html.append(chunk_html)
|
| 203 |
-
# Merge the global LLM results and any designated chunks.
|
| 204 |
all_chunks = []
|
| 205 |
all_guidelines = []
|
| 206 |
all_token_sizes = []
|
|
@@ -212,7 +240,6 @@ Generate a JSON object with exactly two fields (no extra commentary):
|
|
| 212 |
all_chunks.extend(identified_chunks_from_llm)
|
| 213 |
all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
|
| 214 |
all_token_sizes.extend(chunk_token_sizes_from_llm)
|
| 215 |
-
# If still nothing, fall back to grouping paragraphs.
|
| 216 |
if not all_chunks:
|
| 217 |
all_paragraphs = soup.find_all("p")
|
| 218 |
group_size = max(1, len(all_paragraphs) // 10)
|
|
@@ -230,7 +257,7 @@ Generate a JSON object with exactly two fields (no extra commentary):
|
|
| 230 |
|
| 231 |
improvements_summary = [] # To store a plain text summary for each processed chunk
|
| 232 |
|
| 233 |
-
# --- Process each chunk
|
| 234 |
for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
|
| 235 |
chunk_prompt = f"""Improve the following report chunk based on these guidelines:
|
| 236 |
{guideline}
|
|
@@ -247,7 +274,7 @@ Knowledge Crumbs: {knowledge_crumbs}
|
|
| 247 |
Complementary Guidance: {complementary_guidance}
|
| 248 |
Full Report: {report_html}
|
| 249 |
|
| 250 |
-
Please output a JSON object with exactly two
|
| 251 |
{{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
|
| 252 |
"""
|
| 253 |
try:
|
|
@@ -258,7 +285,6 @@ Please output a JSON object with exactly two fields (no extra commentary):
|
|
| 258 |
chunk_summary = chunk_json.get("summary")
|
| 259 |
if improved_chunk and chunk_summary:
|
| 260 |
improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
|
| 261 |
-
# Replace the original chunk; find the corresponding content in the soup and replace it.
|
| 262 |
orig = BeautifulSoup(chunk_html, "html.parser")
|
| 263 |
new_chunk = BeautifulSoup(improved_chunk, "html.parser")
|
| 264 |
replacement = soup.find(string=lambda text: text and text.strip() in orig.get_text())
|
|
@@ -274,12 +300,9 @@ Please output a JSON object with exactly two fields (no extra commentary):
|
|
| 274 |
except Exception as e:
|
| 275 |
logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")
|
| 276 |
|
| 277 |
-
# Get the updated report HTML as a string.
|
| 278 |
final_report_html = str(soup)
|
| 279 |
-
# Create a plain text summary combining all improvements.
|
| 280 |
summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
|
| 281 |
global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
|
| 282 |
-
# Append both summaries (with a separator) to the original Q&A.
|
| 283 |
updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
|
| 284 |
|
| 285 |
return final_report_html, updated_qa
|
|
|
|
| 86 |
updated_history = chat_history + [[user_message, answer]]
|
| 87 |
return updated_history, ""
|
| 88 |
|
| 89 |
+
def expand_snippet_area(full_html: str, snippet: str) -> str:
    """
    Locate the innermost element of ``full_html`` whose text contains the text
    of ``snippet``, then expand the selection upward through allowed container
    tags (div, table, iframe, section) to capture fuller context.

    The upward traversal stops as soon as the parent is not an allowed
    container, or just before reaching <body>.

    Args:
        full_html: The complete report HTML to search in.
        snippet: An HTML fragment (as returned by the snippet-extraction LLM
            call) identifying the area to expand.

    Returns:
        The outer HTML of the expanded container, or ``snippet`` unchanged
        when no containing element can be located.
    """
    allowed_tags = {"div", "table", "iframe", "section"}
    soup = BeautifulSoup(full_html, "html.parser")

    # The snippet is itself an HTML fragment; match on its *text* content.
    # tag.get_text() strips markup, so substring-testing the raw HTML snippet
    # against it would never match whenever the snippet contains any tags.
    snippet_text = BeautifulSoup(snippet, "html.parser").get_text().strip()
    if not snippet_text:
        return snippet

    # find() would return the first match in pre-order traversal — i.e. the
    # OUTERMOST element whose text contains the snippet (often the document
    # root) — which defeats the upward expansion below. Ancestors precede
    # descendants in pre-order, so the last match is a deepest (innermost)
    # element containing the snippet text.
    matches = soup.find_all(lambda tag: snippet_text in tag.get_text())
    if not matches:
        return snippet
    current = matches[-1]

    # Climb through allowed container tags only, never crossing into <body>.
    while current.parent is not None and current.parent.name.lower() != "body":
        if current.parent.name.lower() in allowed_tags:
            current = current.parent
        else:
            break
    return str(current)
|
| 110 |
+
|
| 111 |
def fine_tune_report(adjustmentguidelines: str, openai_api_key: str, serpapi_api_key: str, report_html: str,
|
| 112 |
initial_request: str, qa: str, target_style: str, knowledge_crumbs: str,
|
| 113 |
complementary_guidance: str) -> (str, str):
|
|
|
|
| 115 |
os.environ["OPENAI_API_KEY"] = openai_api_key
|
| 116 |
os.environ["SERPAPI_API_KEY"] = serpapi_api_key
|
| 117 |
|
| 118 |
+
# Parse the entire report HTML.
|
| 119 |
soup = BeautifulSoup(report_html, "html.parser")
|
| 120 |
|
| 121 |
+
# --- Specific adjustment extraction ---
|
| 122 |
+
# When an adjustment instruction (such as "rewrite the visual after 6.1") is provided,
|
| 123 |
+
# ask the LLM to locate every occurrence and extract the HTML snippet that needs adjustment.
|
| 124 |
+
# These snippets might be many (or none), and for each snippet, we then expand the selection
|
| 125 |
+
# to include its outer container (like a div, table, or iframe) for full context.
|
| 126 |
if adjustmentguidelines.strip():
|
| 127 |
+
extraction_prompt = f"""You are a technical editor. Review the following report HTML and, based on the specific user instruction below, extract only the precise HTML snippet(s) (including any meaningful surrounding context) that must be improved.
|
| 128 |
|
| 129 |
User Instruction: "{adjustmentguidelines}"
|
| 130 |
|
|
|
|
| 132 |
{report_html}
|
| 133 |
|
| 134 |
Provide a JSON object with a single key "identified_snippets" mapping to an array of HTML snippets that require adjustment.
|
| 135 |
+
Do not include any additional commentary or markdown formatting.
|
| 136 |
"""
|
| 137 |
extraction_result = openai_call(prompt=extraction_prompt, model="o3-mini", max_tokens_param=1500, temperature=0.5)
|
| 138 |
try:
|
|
|
|
| 143 |
logging.error(f"Error extracting snippets: {e}. Raw result: {extraction_result}")
|
| 144 |
identified_snippets = []
|
| 145 |
if identified_snippets:
|
| 146 |
+
expanded_snippets = []
|
| 147 |
+
# Process EVERY occurrence in the returned array.
|
| 148 |
+
for snippet in identified_snippets:
|
| 149 |
+
expanded = expand_snippet_area(report_html, snippet)
|
| 150 |
+
expanded_snippets.append(expanded)
|
| 151 |
+
all_chunks = expanded_snippets
|
| 152 |
+
all_guidelines = [adjustmentguidelines.strip() for _ in range(len(expanded_snippets))]
|
| 153 |
+
all_token_sizes = [1000] * len(expanded_snippets)
|
| 154 |
else:
|
| 155 |
logging.info("No specific snippets extracted with the adjustment instruction. Falling back to default global analysis.")
|
| 156 |
all_chunks = []
|
|
|
|
| 161 |
all_guidelines = []
|
| 162 |
all_token_sizes = []
|
| 163 |
|
| 164 |
+
# --- Fallback global analysis if no specific snippets were extracted ---
|
| 165 |
if not all_chunks:
|
| 166 |
designated_chunks = soup.find_all("div", class_="improvable-chunk")
|
| 167 |
+
global_chunk_prompt = f"""Review the entire report HTML provided below and identify specific sections that should be improved for clarity, consistency, and overall readability.
|
| 168 |
+
The identified chunks should be distributed across the document in order to enhance alignment with the initial request and complementary guidance.
|
| 169 |
|
| 170 |
+
Please provide a JSON object with exactly three keys (without additional commentary):
|
| 171 |
|
| 172 |
"identified_chunks": An array of HTML snippets representing the chunks to be adjusted.
|
| 173 |
+
"chunk_adjustment_guidelines": A list of guideline strings (each with bullet points) for each chunk.
|
| 174 |
"chunk_token_sizes": A list of integers indicating the recommended token size for processing each corresponding chunk.
|
| 175 |
|
| 176 |
Report HTML:
|
|
|
|
| 204 |
chunk_adjustment_guidelines_from_llm = []
|
| 205 |
chunk_token_sizes_from_llm = []
|
| 206 |
|
|
|
|
| 207 |
designated_chunks_html = []
|
| 208 |
designated_guidelines = []
|
| 209 |
designated_token_sizes = []
|
|
|
|
| 213 |
designated_prompt = f"""Given the following report chunk:
|
| 214 |
{chunk_html}
|
| 215 |
|
| 216 |
+
Generate a JSON object with exactly two keys (no extra commentary):
|
| 217 |
+
"guideline": A string with bullet-point guidelines on how to adjust this chunk, ensuring modifications align with the research query and that citations are updated ([x]).
|
| 218 |
"token_size": An integer representing the recommended token size for processing this chunk.
|
| 219 |
"""
|
| 220 |
try:
|
|
|
|
| 229 |
designated_guidelines.append("")
|
| 230 |
designated_token_sizes.append(1000)
|
| 231 |
designated_chunks_html.append(chunk_html)
|
|
|
|
| 232 |
all_chunks = []
|
| 233 |
all_guidelines = []
|
| 234 |
all_token_sizes = []
|
|
|
|
| 240 |
all_chunks.extend(identified_chunks_from_llm)
|
| 241 |
all_guidelines.extend(chunk_adjustment_guidelines_from_llm)
|
| 242 |
all_token_sizes.extend(chunk_token_sizes_from_llm)
|
|
|
|
| 243 |
if not all_chunks:
|
| 244 |
all_paragraphs = soup.find_all("p")
|
| 245 |
group_size = max(1, len(all_paragraphs) // 10)
|
|
|
|
| 257 |
|
| 258 |
improvements_summary = [] # To store a plain text summary for each processed chunk
|
| 259 |
|
| 260 |
+
# --- Process each chunk ---
|
| 261 |
for idx, (chunk_html, guideline, token_size) in enumerate(zip(all_chunks, all_guidelines, all_token_sizes), start=1):
|
| 262 |
chunk_prompt = f"""Improve the following report chunk based on these guidelines:
|
| 263 |
{guideline}
|
|
|
|
| 274 |
Complementary Guidance: {complementary_guidance}
|
| 275 |
Full Report: {report_html}
|
| 276 |
|
| 277 |
+
Please output a JSON object with exactly two keys (no extra commentary):
|
| 278 |
{{"improved": "<the improved chunk in valid HTML>", "summary": "<a brief summary of changes>"}}
|
| 279 |
"""
|
| 280 |
try:
|
|
|
|
| 285 |
chunk_summary = chunk_json.get("summary")
|
| 286 |
if improved_chunk and chunk_summary:
|
| 287 |
improvements_summary.append(f"Chunk {idx}: {chunk_summary}")
|
|
|
|
| 288 |
orig = BeautifulSoup(chunk_html, "html.parser")
|
| 289 |
new_chunk = BeautifulSoup(improved_chunk, "html.parser")
|
| 290 |
replacement = soup.find(string=lambda text: text and text.strip() in orig.get_text())
|
|
|
|
| 300 |
except Exception as e:
|
| 301 |
logging.error(f"Error processing chunk {idx}: {e}. Raw result: {chunk_result}")
|
| 302 |
|
|
|
|
| 303 |
final_report_html = str(soup)
|
|
|
|
| 304 |
summary_text = "Summary of Fine-Tuning Improvements:\n" + "\n".join(improvements_summary)
|
| 305 |
global_summary = "Combined Chunk Improvement Guidelines:\n" + "\n".join(all_guidelines)
|
|
|
|
| 306 |
updated_qa = qa.strip() + "\n----------\n" + global_summary + "\n" + summary_text
|
| 307 |
|
| 308 |
return final_report_html, updated_qa
|