oberbics committed on
Commit
110e9ac
·
verified ·
1 Parent(s): 78595fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -217
app.py CHANGED
@@ -4,29 +4,12 @@ os.environ["OMP_NUM_THREADS"] = "1"
4
  import gradio as gr
5
  import torch
6
  import re
7
- import json
8
- import datetime
9
- import logging
10
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
11
  import spaces
12
 
13
- # Configure logging
14
- logging.basicConfig(
15
- level=logging.INFO,
16
- format='%(asctime)s - %(levelname)s - %(message)s',
17
- handlers=[
18
- logging.FileHandler('argument_extraction.log'),
19
- logging.StreamHandler()
20
- ]
21
- )
22
- logger = logging.getLogger(__name__)
23
-
24
  # Model configuration
25
  MODEL_ID = "oberbics/newspaper-argument-mining-V1"
26
 
27
- # Add minimum length threshold for arguments
28
- MIN_ARGUMENT_LENGTH = 50 # Adjust this value as needed
29
-
30
  SYSTEM_PROMPT = """You are an expert at analyzing historical texts and you hate to summarize
31
 
32
  OUTPUT FORMAT - EXACTLY these 4 XML tags and NOTHING else:
@@ -58,13 +41,10 @@ RULES:
58
  - More than one argumentative unit possible for one aticle, one unit has one clear clame and all the xml structures"""
59
 
60
  print("Loading tokenizer...")
61
- logger.info("Starting tokenizer loading")
62
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
63
  tokenizer.pad_token = tokenizer.eos_token
64
- logger.info("Tokenizer loaded successfully")
65
 
66
  print("Loading model...")
67
- logger.info("Starting model loading")
68
  bnb_config = BitsAndBytesConfig(
69
  load_in_4bit=True,
70
  bnb_4bit_quant_type="nf4",
@@ -80,216 +60,77 @@ model = AutoModelForCausalLM.from_pretrained(
80
  trust_remote_code=True
81
  )
82
  print("Model loaded successfully!")
83
- logger.info("Model loaded successfully")
84
 
85
- def parse_and_filter_arguments(response, min_length=MIN_ARGUMENT_LENGTH):
86
- """Parse XML response and filter out arguments that are too short"""
87
- try:
88
- # Extract argument text using regex
89
- argument_match = re.search(r'<argument>(.*?)</argument>', response, re.DOTALL)
90
-
91
- if argument_match:
92
- argument_text = argument_match.group(1).strip()
93
-
94
- # Check if argument is meaningful and long enough
95
- if argument_text and argument_text != "NA" and len(argument_text) < min_length:
96
- logger.info(f"Argument filtered out due to length: {len(argument_text)} chars (min: {min_length})")
97
-
98
- # Replace with NA format
99
- filtered_response = """<argument>NA</argument>
100
- <claim>NA</claim>
101
- <explanation>NA</explanation>
102
- <human_verification_needed>False</human_verification_needed>"""
103
- return filtered_response, True # True indicates it was filtered
104
-
105
- return response, False # False indicates no filtering
106
-
107
- except Exception as e:
108
- logger.error(f"Error parsing arguments: {e}")
109
- return response, False
110
 
111
- def log_interaction(input_text, temperature, output, processing_time, error=None, filtered=False):
112
- """Log each interaction to JSON file for analysis"""
113
- log_entry = {
114
- "timestamp": datetime.datetime.now().isoformat(),
115
- "input_length": len(input_text) if input_text else 0,
116
- "input_preview": input_text[:100] if input_text else "",
117
- "temperature": temperature,
118
- "output_length": len(output) if output else 0,
119
- "processing_time_seconds": processing_time,
120
- "has_error": error is not None,
121
- "error_message": str(error) if error else None,
122
- "output_preview": output[:200] if output else "",
123
- "filtered_for_length": filtered
124
- }
125
-
126
- # Save to JSON file
127
- try:
128
- with open('interaction_logs.json', 'a') as f:
129
- f.write(json.dumps(log_entry) + '\n')
130
- except Exception as e:
131
- logger.error(f"Failed to save interaction log: {e}")
132
 
133
  @spaces.GPU
134
- def extract_arguments(text, temperature=0.1, min_arg_length=MIN_ARGUMENT_LENGTH):
135
- start_time = datetime.datetime.now()
136
- logger.info(f"Processing request - Input length: {len(text) if text else 0}, Temperature: {temperature}, Min argument length: {min_arg_length}")
137
-
138
  if not text or not text.strip():
139
- error_msg = "Please enter some text to analyze."
140
- logger.warning("Empty input received")
141
- log_interaction(text, temperature, "", 0, error_msg)
142
- return "", error_msg
143
 
144
- try:
145
- prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
146
  {SYSTEM_PROMPT}<|eot_id|>
147
  <|start_header_id|>user<|end_header_id|>
148
  Extract arguments from historical text.
149
  {text}<|eot_id|>
150
  <|start_header_id|>assistant<|end_header_id|>"""
151
-
152
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=5048).to(model.device)
153
- input_length = inputs['input_ids'].shape[1]
154
-
155
- try:
156
- if temperature is None:
157
- temperature = 0.1
158
- else:
159
- temperature = float(temperature)
160
-
161
- if temperature < 0.01:
162
- temperature = 0.01
163
- elif temperature > 0.3:
164
- temperature = 0.3
165
- except:
166
- temperature = 0.1
167
-
168
- logger.info(f"Starting model generation with {input_length} input tokens")
169
-
170
- with torch.no_grad():
171
- outputs = model.generate(
172
- **inputs,
173
- max_new_tokens=5000,
174
- temperature=temperature,
175
- do_sample=True if temperature > 0.01 else False,
176
- top_p=0.9,
177
- pad_token_id=tokenizer.eos_token_id,
178
- repetition_penalty=1.1
179
- )
180
-
181
- generated_tokens = outputs[0][input_length:]
182
- response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
183
-
184
- # Fix XML start
185
- if not response.startswith('<argument>'):
186
- arg_start = response.find('<argument>')
187
- if arg_start != -1:
188
- response = response[arg_start:]
189
-
190
- # Filter out short arguments
191
- filtered_response, was_filtered = parse_and_filter_arguments(response, min_arg_length)
192
-
193
- processing_time = (datetime.datetime.now() - start_time).total_seconds()
194
- logger.info(f"Processing completed in {processing_time:.2f} seconds - Output length: {len(filtered_response)} - Filtered: {was_filtered}")
195
-
196
- # Log successful interaction
197
- log_interaction(text, temperature, filtered_response, processing_time, filtered=was_filtered)
198
-
199
- return filtered_response
200
-
201
- except Exception as e:
202
- processing_time = (datetime.datetime.now() - start_time).total_seconds()
203
- error_msg = f"Error during processing: {str(e)}"
204
- logger.error(f"Processing failed after {processing_time:.2f} seconds: {e}")
205
-
206
- # Log failed interaction
207
- log_interaction(text, temperature, "", processing_time, e)
208
-
209
- return error_msg
210
-
211
- def get_logs():
212
- """Function to view recent logs"""
213
- try:
214
- with open('interaction_logs.json', 'r') as f:
215
- lines = f.readlines()
216
- recent_logs = lines[-10:] # Last 10 interactions
217
-
218
- log_summary = []
219
- for line in recent_logs:
220
- entry = json.loads(line)
221
- confidence_info = f", Confidence: {entry['confidence_score']:.3f}" if entry.get('confidence_score') else ""
222
- filtered_info = " [FILTERED]" if entry.get('filtered_for_length') else ""
223
- rejected_info = " [REJECTED]" if entry.get('rejected') else ""
224
- summary = f"[{entry['timestamp']}] Input: {entry['input_length']} chars, Output: {entry['output_length']} chars, Time: {entry['processing_time_seconds']:.2f}s{confidence_info}{filtered_info}{rejected_info}"
225
- if entry['has_error']:
226
- summary += f" ERROR: {entry['error_message']}"
227
- log_summary.append(summary)
228
-
229
- return "\n".join(log_summary)
230
- except Exception as e:
231
- return f"Error reading logs: {e}"
232
-
233
- # Gradio interface with logging viewer and length control
234
- with gr.Blocks(title="Newspaper Argumentative Unit Extractor") as demo:
235
- gr.Markdown("# Newspaper Argumentative Unit Extractor")
236
- gr.Markdown("Extract argumentative units from news sources (filters out arguments shorter than specified length)")
237
 
238
- with gr.Tab("Extract Arguments"):
239
- with gr.Row():
240
- with gr.Column():
241
- input_text = gr.Textbox(
242
- label="Input Text",
243
- placeholder="Enter newspaper text here...",
244
- lines=10
245
- )
246
- temperature = gr.Slider(
247
- minimum=0.01,
248
- maximum=0.3,
249
- value=0.1,
250
- step=0.01,
251
- label="Temperature (lower = more consistent)"
252
- )
253
- min_length = gr.Slider(
254
- minimum=10,
255
- maximum=200,
256
- value=MIN_ARGUMENT_LENGTH,
257
- step=5,
258
- label="Minimum Argument Length (characters)"
259
- )
260
- extract_btn = gr.Button("Extract Arguments", variant="primary")
261
-
262
- with gr.Column():
263
- output_text = gr.Textbox(
264
- label="Raw XML Output",
265
- lines=8
266
- )
267
-
268
- extract_btn.click(
269
- fn=extract_arguments,
270
- inputs=[input_text, temperature, min_length],
271
- outputs=[output_text]
272
- )
273
-
274
- # Examples
275
- gr.Examples(
276
- examples=[
277
- ["Reggio, January 8. Frequent shocks of earthquake were felt here dur ing the night, accompanied at times by loud subter ranean reports. A few buildings that had not been completely destroyed were further damaged. The work of reconstructing the railway is being pushed forward energetically. News has been received from Brancaleone, Catanzaro, and Palmi of earthquakes by which the inhabitants were alarmed last night", 0.1, 50],
278
- ],
279
- inputs=[input_text, temperature, min_length],
280
- outputs=[output_text],
281
- fn=extract_arguments
282
  )
283
 
284
- with gr.Tab("Logs"):
285
- gr.Markdown("## Recent Activity Logs")
286
- log_display = gr.Textbox(
287
- label="Recent Interactions",
288
- lines=15,
289
- value=get_logs()
290
- )
291
- refresh_btn = gr.Button("Refresh Logs")
292
- refresh_btn.click(fn=get_logs, outputs=[log_display])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
  if __name__ == "__main__":
295
- demo.launch()
 
4
  import gradio as gr
5
  import torch
6
  import re
 
 
 
7
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
8
  import spaces
9
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Model configuration
11
  MODEL_ID = "oberbics/newspaper-argument-mining-V1"
12
 
 
 
 
13
  SYSTEM_PROMPT = """You are an expert at analyzing historical texts and you hate to summarize
14
 
15
  OUTPUT FORMAT - EXACTLY these 4 XML tags and NOTHING else:
 
41
  - More than one argumentative unit possible for one aticle, one unit has one clear clame and all the xml structures"""
42
 
43
  print("Loading tokenizer...")
 
44
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
45
  tokenizer.pad_token = tokenizer.eos_token
 
46
 
47
  print("Loading model...")
 
48
  bnb_config = BitsAndBytesConfig(
49
  load_in_4bit=True,
50
  bnb_4bit_quant_type="nf4",
 
60
  trust_remote_code=True
61
  )
62
  print("Model loaded successfully!")
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  @spaces.GPU
68
+ def extract_arguments(text, temperature=0.1):
 
 
 
69
  if not text or not text.strip():
70
+ return "", "Please enter some text to analyze."
 
 
 
71
 
72
+ prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
 
73
  {SYSTEM_PROMPT}<|eot_id|>
74
  <|start_header_id|>user<|end_header_id|>
75
  Extract arguments from historical text.
76
  {text}<|eot_id|>
77
  <|start_header_id|>assistant<|end_header_id|>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=5048).to(model.device)
80
+ input_length = inputs['input_ids'].shape[1]
81
+ try:
82
+ if temperature is None:
83
+ temperature = 0.1
84
+ else:
85
+ temperature = float(temperature)
86
+
87
+ if temperature < 0.01:
88
+ temperature = 0.01
89
+ elif temperature > 0.3:
90
+ temperature = 0.3
91
+ except:
92
+ temperature = 0.1
93
+
94
+ with torch.no_grad():
95
+ outputs = model.generate(
96
+ **inputs,
97
+ max_new_tokens=5000,
98
+ temperature=temperature,
99
+ do_sample=True if temperature > 0.01 else False,
100
+ top_p=0.9,
101
+ pad_token_id=tokenizer.eos_token_id,
102
+ repetition_penalty=1.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  )
104
 
105
+ generated_tokens = outputs[0][input_length:]
106
+ response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
107
+
108
+ # Fix XML start
109
+ if not response.startswith('<argument>'):
110
+ arg_start = response.find('<argument>')
111
+ if arg_start != -1:
112
+ response = response[arg_start:]
113
+
114
+ return response
115
+
116
+
117
+ # Gradio interface
118
+ demo = gr.Interface(
119
+ fn=extract_arguments,
120
+ inputs=[
121
+ gr.Textbox(label="Input Text", placeholder="Enter newspaper text here...", lines=10),
122
+ gr.Slider(minimum=0.01, maximum=0.3, value=0.1, step=0.01, label="Temperature (lower = more consistent)")
123
+ ],
124
+ outputs=[
125
+ gr.Textbox(label="Raw XML Output", lines=8)
126
+ ],
127
+ title="Newspaper Argumentative Unit Extractor",
128
+ description="Extract argumentative units from news sources",
129
+ examples=[
130
+ ["Reggio, January 8. Frequent shocks of earthquake were felt here dur ing the night, accompanied at times by loud subter ranean reports. A few buildings that had not been completely destroyed were further damaged. The work of reconstructing the railway is being pushed forward energetically. News has been received from Brancaleone, Catanzaro, and Palmi of earthquakes by which the inhabitants were alarmed last night", 0.1],
131
+ ["The bourses and theatres are closed. In every quarter help committees have been estab lished. A central committee has been organised at Rome for the purpose of privately and publicly collecting donations, and organising relief expedi tions to the afflicted districts. The Duke of Aosta has accepted the presidency. From all parts of the globe come telegrams of sympathy. The entire press has founded relief funds. Every Ambassador and Minister in Rome personally visited the Ministry of the Exterior yesterday morning, and expressed sympathy on behalf of their respective countries. Doctors, firemen, and municipal guards have been despatched to Messina and Calabria from many Italian towns. The Lombard Bank of Milan has already distributed 250,000 lire to sufferers from the earthquake, and the city of Milan has sent 25 firemen to Messina. A curious result of the earthquake is that the craters of Aetna, Vesuvius, and Stromboli ceased their activity immediately after the shock. It is reported from Malta that the British war- ships “Exmouth,” “Euryalus,” “Minerva,” and “Sutlej” have left for Messina. The French Government has sent two armoured ships and three destroyers to Messina. President Fallieres, Premier Clemenceau, Minister Pichon, and the Presidents of the Senate and Chamber have all sent messages of sympathy to the Italian Government. Palermo, December 30. Yesterday evening the first official telegraphic des- patches from the prefect of Messina reached here, They state that the catastrophe is beyond human description. Many thousands of people are known to have perished. It is impossible, says the prefect, to accurately relate the frightful scenes witnessed. The help already proffered and accepted is insuffi cient for the purpose. There is pressing need of extraordinary measures of help, and provisions are in great demand. At the time of wiring the fires in many parts of the ruined city have not been got under control, and are spreading in many directions. 
Catania, December 30. A survivor from the catastrophe at Messina who has arrived here says: “It is impossible to describe the appalling scene. The city has been transformed into a vast heap of ruins. Almost all the inhabitants were killed; only a few thousands escaped death. There is need of doctors, tents, clothing, and pro visions for the survivors, who, deprived of all ne cessaries, are exposed to the inclemencies of the winter weather. There is need of fire engines to cope with the flames that are raging among the ruins. Messina appears as if it had been swept away by the earthquake. The railway station has collapsed. Railway carriages have been destroyed. Almost all the railway employes are dead. The streets are no longer recognisable; they look like enormous fissures in a distant and extensive heap of ruins. The Uni versity, the Post and Telegraph Office, and all the other public buildings have disappeared. The gas mains are entirely destroyed. For hours after the catastrophe the town was without any help, as the authorities, the garrison, the doctors, and apothe* * caries,—in short, all classes of the population, wjere buried under the ruins.” Three more trains and** a steamer have left Messina with vjarKled and gitives. Bremen, In cembcr 30, Information has been recei\ o, i rom the Rprt authorities at Naples that the ! ‘ -house in the Straits of Messina has been destrc* - ' It seems doubtful whether the navigation of ■ < l traits will be pos sible without risk. The N r.h German Lloyd has therefore ordered all its ship commanders to avoid the Straits. All communicauon with Sicily is inter rupted. Rome, December 30. Newspaper reports from Catanzaro state that the prefect of. Reggio, who was believed to have pe rished, has arrived there and says that he managed to escape from the prefecture when the greater part of the building had fallen in. The surrounding streets and the centre of the town down to the harbour have been totally destroyed. 
Only the small villas clustering in the hills surrounding the town and on the Promenade of Reggio and Campi are intact. The castle, the cathedral, and the Lyceum all collapsed, and practically every student in the Ly ceum met his or her death. The prefect adds that he believes the Bishop to be dead. The barracks fell in, burying hundreds of soldiers. Reports as to the fate of the council house are contradictory. All the fugitives from Reggio describe the disaster as frightful, and estimate the number of dead as (Continued on page 3)", 0.1]
132
+ ]
133
+ )
134
 
135
  if __name__ == "__main__":
136
+ demo.launch()