File size: 6,896 Bytes
7d59f2d
 
 
 
 
1147c3b
7d59f2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub repository id used for both the tokenizer and the model.
MODEL_ID = "yujiangw/AutoGEO_mini_Qwen1.7B"


# Decide the device once at import time; weight dtype and placement follow it:
# half precision with automatic placement on CUDA, full precision on CPU.
if torch.cuda.is_available():
    device = "cuda"
    _model_load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
else:
    device = "cpu"
    _model_load_options = {"torch_dtype": torch.float32, "device_map": None}

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    **_model_load_options,
)
# Put the model in evaluation mode; this script only generates text.
model.eval()


# Default rewriting guidelines, one per line. This text pre-fills the rules box
# in the UI; the list form below is what the prompt builder falls back to.
DEFAULT_RULES_TEXT: str = """\
Attribute all factual claims to credible, authoritative sources with clear citations.
Cover the topic comprehensively, addressing all key aspects and sub-topics.
Ensure information is factually accurate and verifiable.
Focus exclusively on the topic, eliminating irrelevant information, navigational links, and advertisements.
Maintain a neutral, objective tone, avoiding promotional language, personal opinions, and bias.
Maintain high-quality writing, free from grammatical errors, typos, and formatting issues.
Present a balanced perspective on complex topics, acknowledging multiple significant viewpoints or counter-arguments.
Present information as a self-contained unit, not requiring external links for core understanding.
Provide clear, specific, and actionable steps.
Provide explanatory depth by clarifying underlying causes, mechanisms, and context ('how' and 'why').
State the key conclusion at the beginning of the document.
Structure content logically with clear headings, lists, and paragraphs to ensure a cohesive flow.
Substantiate claims with specific, concrete details like data, statistics, or named examples.
Use clear and concise language, avoiding jargon, ambiguity, and verbosity.
Use current information, reflecting the latest state of knowledge."""

# Same guidelines as a list of strings, in the same order.
DEFAULT_RULES_LIST: list[str] = DEFAULT_RULES_TEXT.splitlines()


def build_autogeo_prompt(summary: str, rules_text: str | None) -> str:
    if rules_text is not None and rules_text.strip():
        rules = [line.strip() for line in rules_text.splitlines() if line.strip()]
    else:
        rules = DEFAULT_RULES_LIST

    rules_string = "- " + "\n- ".join(rules)

    user_prompt = f"""
Here is the source:
{summary}
You are given a website document as a source. This source, along with other sources, will be used by a language model (LLM) to generate answers to user questions, with each line in the generated answer being cited with its original source. Your task, as the owner of the source, is to rewrite your document in a way that maximizes its visibility and impact in the LLM's final answer, ensuring your source is more likely to be quoted and cited.
You can regenerate the provided source so that it strictly adheres to the "Quality Guidelines", and you can also apply any other methods or techniques, as long as they help your rewritten source text rank higher in terms of relevance, authority, and impact in the LLM's generated answers.
## Quality Guidelines to Follow:
{rules_string}
Now rewrite the source accordingly.
""".strip()

    return user_prompt


@torch.no_grad()
def rewrite_document(
    raw_page: str,
    rules_text: str,
    temperature: float = 0.7,
    max_new_tokens: int = 2048,
    top_p: float = 0.9
) -> str:
    if not raw_page.strip():
        return "Please paste the original web page content in the input box."

    prompt = build_autogeo_prompt(raw_page, rules_text)

    inputs = tokenizer(
        prompt,
        return_tensors="pt"
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output_ids = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        pad_token_id=tokenizer.eos_token_id
    )

    generated_ids = output_ids[0][inputs["input_ids"].shape[-1]:]
    text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return text.strip()


# UI definition: a header, a row of three columns (original text | rewriting
# rules | rewritten output), then a row with the sampling controls and the
# submit button. Components are registered in the order they are created
# inside the nested `with` blocks, so statement order here is significant.
with gr.Blocks(title="AutoGEO Mini Rewriting Demo") as demo:
    # Page header and short usage instructions.
    gr.Markdown(
        """
# AutoGEO Mini Rewriting Demo
Paste an original web page/document on the left.  
Optionally customize the rewriting rules in the middle.  
The rewritten document will appear on the right.
If the rules area is left empty, the demo will use the default rule set
**extracted on the Researchy-GEO dataset with Gemini-2.5-Flash-Lite as the generative engine**.
        """
    )

    with gr.Row(equal_height=True):
        # Column 1: the source document to rewrite.
        with gr.Column(scale=4):
            gr.Markdown("### 1️⃣ Original web page content")
            input_box = gr.Textbox(
                lines=22,
                label="",
                placeholder="Paste the original web page HTML/text here...",
                show_label=False
            )

        # Column 2: editable rules, pre-filled with the default rule text.
        with gr.Column(scale=3):
            gr.Markdown(
                """
### 2️⃣ Rewriting rules (one rule per line)
- You can edit, add, or remove rules below.
- If you clear this box and leave it empty, the default AutoGEO rule set will be used.
                """
            )
            rules_box = gr.Textbox(
                value=DEFAULT_RULES_TEXT,
                lines=22,
                label="Custom rules (optional)",
                placeholder="One rule per line. Leave empty to use the default Researchy-GEO rule set."
            )

        # Column 3: where the return value of rewrite_document is shown.
        with gr.Column(scale=4):
            gr.Markdown("### 3️⃣ Rewritten document")
            output_box = gr.Textbox(
                lines=22,
                label="",
                placeholder="Model output will appear here.",
                show_label=False
            )

    gr.Markdown("---")

    # Generation controls; their values are passed to rewrite_document.
    with gr.Row():
        with gr.Column(scale=3):
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                step=0.05,
                label="Temperature"
            )
        with gr.Column(scale=3):
            # NOTE: the UI starts at 1024, while rewrite_document's own
            # default for max_new_tokens is 2048.
            max_tokens_slider = gr.Slider(
                minimum=256,
                maximum=2048,
                value=1024,
                step=64,
                label="Max new tokens"
            )
        with gr.Column(scale=3):
            top_p_slider = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p"
            )
        with gr.Column(scale=2, min_width=120):
            submit_btn = gr.Button(
                "Rewrite with AutoGEO Mini",
                variant="primary"
            )

    # The order of `inputs` must match rewrite_document's positional
    # parameters: raw_page, rules_text, temperature, max_new_tokens, top_p.
    submit_btn.click(
        fn=rewrite_document,
        inputs=[input_box, rules_box, temperature_slider, max_tokens_slider, top_p_slider],
        outputs=[output_box]
    )

if __name__ == "__main__":
    # Serve the demo only when this file is executed as a script.
    soft_theme = gr.themes.Soft()
    demo.launch(theme=soft_theme)