HMC83 committed on
Commit
dbe7eb9
·
verified ·
1 Parent(s): c10fcc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -52
app.py CHANGED
@@ -3,10 +3,11 @@ import os
3
  import random
4
  import time
5
  import torch
 
6
  from transformers import AutoTokenizer, AutoModelForCausalLM
7
  import spaces
8
 
9
- MODEL_ID = "HMC83/Wihtgar-650M-DPO-Requests-2"
10
 
11
  # --- Load Model and Tokenizer ---
12
  print("Loading model and tokenizer...")
@@ -179,46 +180,33 @@ FOI_COMBINATIONS = [
179
  ALL_AUTHORITIES_FOR_SPIN = list(set([item["authority"] for item in FOI_COMBINATIONS]))
180
  ALL_KEYWORDS_FOR_SPIN = list(set(kw.strip() for item in FOI_COMBINATIONS for kw in item["keywords"].split(',')))
181
 
182
-
183
- # --- Helper Function for Cleaning and Validation ---
184
- def clean_and_validate_output(raw_text: str) -> tuple[str, bool]:
185
  """
186
- Cleans the model's output by keeping only the first complete request.
187
-
188
- It validates that the output contains essential markers ("Dear" and "[Your Name]").
189
- If it detects that the model has started generating a second request, it truncates
190
- the string after the first "[Your Name]".
191
-
192
- Args:
193
- raw_text: The raw string output from the language model.
194
-
195
- Returns:
196
- A tuple containing:
197
- - The cleaned text.
198
- - A boolean flag: True if the output is valid, False if it is malformed.
199
  """
200
- end_marker = "[Your Name]"
201
- start_marker = "Dear"
 
202
 
203
- # Validate: A valid request must contain the end marker.
204
- if end_marker not in raw_text:
205
- return raw_text, False # Malformed, signal for regeneration.
206
 
207
- # Find the end of the first complete request.
208
- first_end_pos = raw_text.find(end_marker)
209
- end_of_first_request_index = first_end_pos + len(end_marker)
210
-
211
- # Check if a second request has started after the first one ended.
212
- start_of_second_request_pos = raw_text.find(start_marker, end_of_first_request_index)
213
-
214
- if start_of_second_request_pos != -1:
215
- # If a second request is found, truncate to keep only the first one.
216
- cleaned_text = raw_text[:end_of_first_request_index]
217
- return cleaned_text, True
218
- else:
219
- # No second request found, the output is valid.
220
- return raw_text, True
221
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  # --- Backend Function for Local Inference ---
224
  @spaces.GPU
@@ -231,7 +219,9 @@ def generate_request_local(authority, kw1, kw2, kw3):
231
  keyword_string = ", ".join(keywords)
232
  prompt = (
233
  "You are an expert at writing formal Freedom of Information requests to UK public authorities. "
234
- f"""Generate a formal Freedom of Information request to {authority} using these keywords: {keyword_string}"""
 
 
235
  )
236
 
237
  max_retries = 2
@@ -243,7 +233,7 @@ def generate_request_local(authority, kw1, kw2, kw3):
243
  # Set generation parameters
244
  generation_params = {
245
  "max_new_tokens": 340,
246
- "temperature": 0.3,
247
  "top_p": 0.95,
248
  "top_k": 50,
249
  "repetition_penalty": 1.1,
@@ -264,22 +254,24 @@ def generate_request_local(authority, kw1, kw2, kw3):
264
  if generated_text.startswith('.\n'):
265
  generated_text = generated_text[2:]
266
 
267
- # **NEW**: Clean and validate the output
268
  cleaned_text, is_valid = clean_and_validate_output(generated_text)
269
-
 
 
 
270
  if is_valid:
271
- return cleaned_text # Success! Return the valid, cleaned text.
272
  else:
273
- print(f"Attempt {attempt + 1}/{max_retries}: Malformed output detected. Retrying...")
274
 
275
  except Exception as e:
276
  print(f"Error during generation attempt {attempt + 1}/{max_retries}: {e}")
277
  if attempt == max_retries - 1:
278
  return f"An error occurred during text generation: {e}"
279
 
280
- # If the loop finishes, all retries have failed
281
- return "Failed to generate a valid request after multiple attempts. Please try again."
282
-
283
 
284
  # --- Gradio UI and Spinning Logic ---
285
  def spin_the_reels():
@@ -298,22 +290,22 @@ def spin_the_reels():
298
  "Spinning..."
299
  )
300
  time.sleep(spin_interval)
301
-
302
  # 2. Select the final fixed combination
303
  final_combination = random.choice(FOI_COMBINATIONS)
304
  final_authority = final_combination["authority"]
305
-
306
  # Split, strip, and pad keywords to ensure we always have 3 for the UI
307
  keywords_list = [k.strip() for k in final_combination["keywords"].split(',')]
308
  keywords_list += [''] * (3 - len(keywords_list)) # Pad with empty strings if < 3
309
  kw1, kw2, kw3 = keywords_list[:3] # Take the first 3
310
-
311
  # Display the final reel values and a "Generating..." message
312
  yield (
313
  final_authority, kw1, kw2, kw3,
314
  f"Generating request for {final_authority}...\nPlease wait, this may take a moment."
315
  )
316
-
317
  # 3. Call the local model and yield the final result
318
  generated_request = generate_request_local(final_authority, kw1, kw2, kw3)
319
  yield (
@@ -369,9 +361,9 @@ with gr.Blocks(css=reels_css, theme=gr.themes.Soft()) as demo:
369
  reel2 = gr.Textbox(label="Keyword 1", interactive=False, elem_id="reel-2", scale=1)
370
  reel3 = gr.Textbox(label="Keyword 2", interactive=False, elem_id="reel-3", scale=1)
371
  reel4 = gr.Textbox(label="Keyword 3", interactive=False, elem_id="reel-4", scale=1)
372
-
373
  pull_button = gr.Button("Generate a request", variant="primary", elem_id="pull-button")
374
-
375
  output_request = gr.Textbox(
376
  label="Generated FOI Request",
377
  lines=15,
@@ -386,4 +378,4 @@ with gr.Blocks(css=reels_css, theme=gr.themes.Soft()) as demo:
386
  )
387
 
388
  if __name__ == "__main__":
389
- demo.launch()
 
3
  import random
4
  import time
5
  import torch
6
+ import re # <-- NEW
7
  from transformers import AutoTokenizer, AutoModelForCausalLM
8
  import spaces
9
 
10
+ MODEL_ID = "HMC83/Wihtgar-650M-SFT-Requests_2-Merged"
11
 
12
  # --- Load Model and Tokenizer ---
13
  print("Loading model and tokenizer...")
 
180
  ALL_AUTHORITIES_FOR_SPIN = list(set([item["authority"] for item in FOI_COMBINATIONS]))
181
  ALL_KEYWORDS_FOR_SPIN = list(set(kw.strip() for item in FOI_COMBINATIONS for kw in item["keywords"].split(',')))
182
 
183
+ # --- Helper: clean model output into a numbered list starting at "1." ---
184
+ def clean_and_validate_output(text: str):
 
185
  """
186
+ Extract the main numbered list starting at '1.' and strip any closing signature lines.
187
+ Always returns cleaned text and a boolean flag (True = looks fine).
 
 
 
 
 
 
 
 
 
 
 
188
  """
189
+ # Keep everything from the first "1." onward, if present.
190
+ m = re.search(r'(?m)^\s*1\.\s', text)
191
+ body = text[m.start():].strip() if m else text.strip()
192
 
193
+ # Remove common signature lines at the end (best-effort).
194
+ body = re.sub(r'(?im)^\s*(yours.*|kind regards.*|regards.*)$', '', body).strip()
 
195
 
196
+ # If it doesn't contain at least one numbered point, it's still usable, but we mark as not strictly-valid.
197
+ is_valid = bool(re.search(r'(?m)^\s*\d+\.\s', body))
198
+ return body, is_valid
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ # --- Helper: wrap content in the FOI letter template ---
201
+ def wrap_in_letter(authority: str, body: str) -> str:
202
+ body = body.strip()
203
+ template = (
204
+ f"Dear {authority}\n\n"
205
+ "Please provide me with a copy of the following information:\n\n"
206
+ f"{body}\n\n"
207
+ "Yours faithfully,"
208
+ )
209
+ return template
210
 
211
  # --- Backend Function for Local Inference ---
212
  @spaces.GPU
 
219
  keyword_string = ", ".join(keywords)
220
  prompt = (
221
  "You are an expert at writing formal Freedom of Information requests to UK public authorities. "
222
+ f"Generate ONLY the numbered list of the specific information being requested, starting at '1.' "
223
+ f"for {authority}, using these keywords: {keyword_string}. "
224
+ "Do not include greetings or signatures."
225
  )
226
 
227
  max_retries = 2
 
233
  # Set generation parameters
234
  generation_params = {
235
  "max_new_tokens": 340,
236
+ "temperature": 0.0,
237
  "top_p": 0.95,
238
  "top_k": 50,
239
  "repetition_penalty": 1.1,
 
254
  if generated_text.startswith('.\n'):
255
  generated_text = generated_text[2:]
256
 
257
+ # Clean and validate the output
258
  cleaned_text, is_valid = clean_and_validate_output(generated_text)
259
+
260
+ # Wrap in the letter template regardless; validation just influences retry behavior
261
+ letter = wrap_in_letter(authority, cleaned_text)
262
+
263
  if is_valid:
264
+ return letter
265
  else:
266
+ print(f"Attempt {attempt + 1}/{max_retries}: Output lacked clear numbering. Retrying...")
267
 
268
  except Exception as e:
269
  print(f"Error during generation attempt {attempt + 1}/{max_retries}: {e}")
270
  if attempt == max_retries - 1:
271
  return f"An error occurred during text generation: {e}"
272
 
273
+ # If retries failed, return the best effort letter using the last cleaned text we had
274
+ return wrap_in_letter(authority, "1. [Unable to format automatically] Please restate the information requested.\n2. [Optional second point]")
 
275
 
276
  # --- Gradio UI and Spinning Logic ---
277
  def spin_the_reels():
 
290
  "Spinning..."
291
  )
292
  time.sleep(spin_interval)
293
+
294
  # 2. Select the final fixed combination
295
  final_combination = random.choice(FOI_COMBINATIONS)
296
  final_authority = final_combination["authority"]
297
+
298
  # Split, strip, and pad keywords to ensure we always have 3 for the UI
299
  keywords_list = [k.strip() for k in final_combination["keywords"].split(',')]
300
  keywords_list += [''] * (3 - len(keywords_list)) # Pad with empty strings if < 3
301
  kw1, kw2, kw3 = keywords_list[:3] # Take the first 3
302
+
303
  # Display the final reel values and a "Generating..." message
304
  yield (
305
  final_authority, kw1, kw2, kw3,
306
  f"Generating request for {final_authority}...\nPlease wait, this may take a moment."
307
  )
308
+
309
  # 3. Call the local model and yield the final result
310
  generated_request = generate_request_local(final_authority, kw1, kw2, kw3)
311
  yield (
 
361
  reel2 = gr.Textbox(label="Keyword 1", interactive=False, elem_id="reel-2", scale=1)
362
  reel3 = gr.Textbox(label="Keyword 2", interactive=False, elem_id="reel-3", scale=1)
363
  reel4 = gr.Textbox(label="Keyword 3", interactive=False, elem_id="reel-4", scale=1)
364
+
365
  pull_button = gr.Button("Generate a request", variant="primary", elem_id="pull-button")
366
+
367
  output_request = gr.Textbox(
368
  label="Generated FOI Request",
369
  lines=15,
 
378
  )
379
 
380
  if __name__ == "__main__":
381
+ demo.launch()