alxd committed on
Commit
50ffeff
Β·
1 Parent(s): b7811cf

basic cleaning tasks

Browse files
Files changed (2) hide show
  1. pdf2txt.py +98 -173
  2. requirements.txt +2 -0
pdf2txt.py CHANGED
@@ -8,17 +8,16 @@ import threading
8
  import uuid
9
  import queue
10
  import time
 
11
  from transformers import AutoTokenizer
12
  from mistralai import Mistral
13
  from huggingface_hub import InferenceClient
14
 
15
-
16
  # ------------------------------
17
  # Helper functions and globals
18
  # ------------------------------
19
  sheet_data = None
20
  file_name = None
21
- sheet = None
22
 
23
  def debug_print(message: str):
24
  print(f"[{datetime.datetime.now().isoformat()}] {message}", flush=True)
@@ -41,7 +40,7 @@ def count_tokens(text: str) -> int:
41
  return len(text.split())
42
 
43
  def generate_response(prompt: str, model_name: str, sheet_data: str) -> str:
44
- full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}" # Append sheet data to prompt
45
 
46
  if "Mistral" in model_name:
47
  mistral_api_key = os.getenv("MISTRAL_API_KEY")
@@ -73,30 +72,61 @@ def generate_response(prompt: str, model_name: str, sheet_data: str) -> str:
73
  else:
74
  raise ValueError("Invalid model selection. Please choose either 'Mistral-API' or 'Meta-Llama-3'.")
75
 
76
-
77
  def process_query(prompt: str, model_name: str):
78
  global sheet_data
79
 
80
- # Handle the case where sheet_data might be None
81
  if sheet_data is None:
82
  sheet_data = get_sheet_data()
83
 
84
- full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}" # Append sheet data to prompt
85
  debug_print(f"Processing query with model {model_name}: {full_prompt}")
86
 
87
- # Generate the response using the specified model and sheet data
88
  response = generate_response(prompt, model_name, sheet_data)
89
-
90
- # Count the number of tokens for input and output
91
- input_tokens = count_tokens(prompt + "\n\n" + sheet_data) # Include sheet data in the input token count
92
  output_tokens = count_tokens(response)
93
 
94
- # Return the response along with token counts
95
  return response, f"Input tokens: {input_tokens}", f"Output tokens: {output_tokens}"
96
 
97
  def ui_process_query(prompt, model_name):
98
  return process_query(prompt, model_name)
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # ------------------------------
101
  # Global variables for background jobs
102
  # ------------------------------
@@ -114,7 +144,6 @@ def get_job_list():
114
  if not jobs:
115
  return "No jobs found. Submit a query or load files to create jobs."
116
 
117
- # Sort jobs by start time (newest first)
118
  sorted_jobs = sorted(
119
  [(job_id, job_info) for job_id, job_info in jobs.items()],
120
  key=lambda x: x[1].get("start_time", 0),
@@ -127,11 +156,8 @@ def get_job_list():
127
  query = job_info.get("query", "")
128
  start_time = job_info.get("start_time", 0)
129
  time_str = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
130
-
131
- # Create a shortened query preview
132
  query_preview = query[:30] + "..." if query and len(query) > 30 else query or "N/A"
133
 
134
- # Color-code the status display
135
  if status == "processing":
136
  status_formatted = f"<span style='color: red'>⏳ {status}</span>"
137
  elif status == "completed":
@@ -148,33 +174,14 @@ def get_job_list():
148
 
149
  def get_sheet_data():
150
  global sheet_data
151
- global file_name
152
- global sheet
153
- file = file_name
154
- sheet_name = sheet
155
- print ("file name: ",file," sheet name: ",sheet_name," ")
156
-
157
- if sheet_data is None:
158
- try:
159
- df = pd.read_excel(file.name, sheet_name=sheet_name)
160
- sheet_data = df.to_string(index=False) # Convert sheet data to string format
161
- return sheet_data # Display sheet data in UI
162
- except Exception as e:
163
- return f"Error reading sheet: {str(e)}"
164
- else:
165
- return sheet_data
166
-
167
- # Assuming process_in_background is using threading to call process_query
168
 
169
  def process_in_background(job_id, func, args):
170
- """Runs a function in the background and stores its result in a shared queue."""
171
  result = func(*args)
172
  results_queue.put((job_id, result))
173
  debug_print(f"Job {job_id} finished processing in background.")
174
 
175
-
176
  def submit_query_async(query, model_choice=None):
177
- """Asynchronous version of submit_query_updated to prevent timeouts."""
178
  global last_job_id
179
  global sheet_data
180
 
@@ -184,8 +191,6 @@ def submit_query_async(query, model_choice=None):
184
  job_id = str(uuid.uuid4())
185
  debug_print(f"Starting async job {job_id} for query: {query}")
186
 
187
-
188
- # Start background thread to process the query
189
  threading.Thread(
190
  target=process_in_background,
191
  args=(job_id, process_query, [query, model_choice or "Mistral-API"])
@@ -207,9 +212,9 @@ def submit_query_async(query, model_choice=None):
207
  f"Job ID: {job_id}",
208
  f"Input tokens: {count_tokens(query)}",
209
  "Output tokens: pending",
210
- job_id, # For UI job id update
211
- query, # For UI query display update
212
- get_job_list() # Updated job list
213
  )
214
 
215
  def job_selected(job_id):
@@ -228,7 +233,6 @@ def check_job_status(job_id):
228
  html_response = "<div style='font-family: monospace;'><p>Please enter a job ID.</p></div>"
229
  return html_response, "", "", "", ""
230
 
231
- # Process any completed jobs in the results queue
232
  try:
233
  while not results_queue.empty():
234
  completed_id, result = results_queue.get_nowait()
@@ -287,7 +291,6 @@ def cleanup_old_jobs():
287
  to_delete = []
288
 
289
  for job_id, job in jobs.items():
290
- # Completed jobs older than 24 hours and processing jobs older than 48 hours will be removed.
291
  if job["status"] == "completed" and (current_time - job.get("end_time", 0)) > 86400:
292
  to_delete.append(job_id)
293
  elif job["status"] == "processing" and (current_time - job.get("start_time", 0)) > 172800:
@@ -301,10 +304,8 @@ def cleanup_old_jobs():
301
 
302
  # Function to run query (dummy function)
303
  def run_query(max_value):
304
- # Simulate a data retrieval or processing function
305
  return [[i, i**2] for i in range(1, max_value + 1)]
306
 
307
- # Function to call both refresh_job_list and check_job_status using the last job ID
308
  def periodic_update(is_checked):
309
  interval = 3 if is_checked else None
310
  debug_print(f"Auto-refresh checkbox is {'checked' if is_checked else 'unchecked'}, every={interval}")
@@ -312,111 +313,46 @@ def periodic_update(is_checked):
312
  global last_job_id
313
  job_list_md = refresh_job_list()
314
  job_status = check_job_status(last_job_id) if last_job_id else ("No job ID available", "", "", "", "")
315
-
316
- # Extract plain text from HTML for status_text
317
  from bs4 import BeautifulSoup
318
  html_content = job_status[0]
319
  plain_text = ""
320
  if html_content:
321
  soup = BeautifulSoup(html_content, "html.parser")
322
  plain_text = soup.get_text()
323
-
324
- # Return all expected outputs, including status_text
325
  return job_list_md, job_status[0], plain_text, job_status[1], job_status[2], job_status[3], job_status[4]
326
  else:
327
- # Return empty values to stop updates - make sure to match the number of expected outputs
328
  return "", "", "", "", "", "", ""
329
 
330
-
331
- # Add email sending function
332
- def send_email(email_address, content, is_formatted=True):
333
- if not email_address or "@" not in email_address:
334
- return "Please enter a valid email address"
335
-
336
- try:
337
- creds = get_gmail_credentials()
338
- service = build("gmail", "v1", credentials=creds)
339
-
340
- # Create email message with appropriate MIME type
341
- msg = MIMEMultipart()
342
- msg["to"] = email_address
343
- msg["subject"] = "Scouting AI Report"
344
- msg.attach(MIMEText(content, "html" if is_formatted else "plain"))
345
-
346
- # Encode email message in base64
347
- encoded_msg = base64.urlsafe_b64encode(msg.as_bytes()).decode()
348
- send_message = {"raw": encoded_msg}
349
-
350
- # Send email using Gmail API
351
- service.users().messages().send(userId="me", body=send_message).execute()
352
- return "Email sent successfully via Gmail API!"
353
-
354
- except Exception as e:
355
- return f"Failed to send email: {str(e)}"
356
-
357
- # Function to copy content to clipboard
358
- def copy_to_clipboard(content):
359
- import pyperclip
360
- pyperclip.copy(content)
361
- return "Copied to clipboard!"
362
-
363
-
364
- # Function to convert HTML to plain text using BeautifulSoup
365
- def copy_plain_text(html_content):
366
- try:
367
- from bs4 import BeautifulSoup
368
- except ImportError:
369
- return "Error: BeautifulSoup is required to convert HTML to plain text. Please install it."
370
- soup = BeautifulSoup(html_content, "html.parser")
371
- plain_text = soup.get_text()
372
- import pyperclip
373
- pyperclip.copy(plain_text)
374
-
375
- return "Copied to clipboard!"
376
-
377
-
378
- # Default prompt template
379
- default_prompt = (
380
- "you are a scouter and play against this player with this stats. "
381
- "Make an scouting report for head coach with weaknesses and strength, and present strategy to stop his strength "
382
- "and explore his weaknesses acoording with this stats, make easily to read combine strength with strategy to stop "
383
- "and weaknesses with explore and in the final of the raport Key points of emphesize. Use html to output the image and dark color backgrounds (pallette dark green, dark red, etc.) for he different sections of the formatted output. "
384
- )
385
-
386
  # ------------------------------
387
  # Gradio UI Layout: Scouting AI App
388
  # ------------------------------
389
 
390
  with gr.Blocks() as app:
391
  # App Title and Description
392
- gr.Markdown("## PDF 2 TXT")
393
- gr.Markdown("Welcome to the PDF conversion App.")
394
 
395
- # Two-column layout for top section (File Load and Job Information)
396
  with gr.Row():
397
  # Left Column: File Load Section (50% width)
398
  with gr.Column(scale=1):
399
  gr.Markdown("### πŸ“ Load File Section")
400
- gr.Markdown("Upload your **.pdf** file below, specify the sheet name, and click *Load File* to process your file.")
401
  file_input = gr.File(label="Upload .pdf File")
402
  page_start_input_file = gr.Textbox(label="Page Start")
403
  page_end_input_file = gr.Textbox(label="Page End")
404
  load_button_file = gr.Button("Load File")
405
- sheet_output_file = gr.Textbox(label="Pages", interactive=False)
406
 
407
  # Right Column: Job Information Section (50% width)
408
  with gr.Column(scale=1):
409
  gr.Markdown("### πŸ“Š Job Information")
410
  gr.Markdown("View all submitted jobs, refresh the list, and check the status of individual jobs.")
411
-
412
- # Fixed-height job list with scrollbar
413
  job_list_display = gr.Markdown(
414
  get_job_list(),
415
  elem_id="job-list-display",
416
  elem_classes=["scrollable-job-list"]
417
  )
418
-
419
- # Add CSS for scrollable job list
420
  gr.HTML("""
421
  <style>
422
  .scrollable-job-list {
@@ -428,57 +364,49 @@ with gr.Blocks() as app:
428
  }
429
  </style>
430
  """)
431
-
432
  refresh_button = gr.Button("Refresh Job List")
433
-
434
  gr.Markdown("#### πŸ” Check Job Status")
435
  job_id_input = gr.Textbox(label="Enter Job ID")
436
  check_status_button = gr.Button("Check Job Status")
437
-
438
- # Cleaning Task Section (left column, below File Load)
439
  with gr.Row():
440
- # Left Column: Submit Query Section
441
  with gr.Column(scale=1):
442
- gr.Markdown("### Cleaning Tasks")
443
- with gr.Row():
444
- auto_refresh_checkbox = gr.Checkbox(
445
- label="Enable Auto Refresh",
446
- value=False # Default to unchecked
447
- )
448
- submit_button = gr.Button("Submit Cleaning Task ")
449
- # Use a Checkbox to control the periodic updates
450
-
451
- # Submit Query Section (left column, below Cleaning Tasks)
 
 
 
452
  with gr.Row():
453
- # Left Column: Submit Query Section
454
  with gr.Column(scale=1):
455
  gr.Markdown("### πŸš€ Submit Query")
456
  gr.Markdown("Enter your prompt below and choose a model. Your query will be processed in the background.")
457
  model_dropdown = gr.Dropdown(
458
  choices=["πŸ‡ΊπŸ‡Έ Remote Meta-Llama-3", "πŸ‡ͺπŸ‡Ί Mistral-API"],
459
- value="πŸ‡ͺπŸ‡Ί Mistral-API", # Default model set to Mistral
460
  label="Select Model"
461
  )
462
- prompt_input = gr.Textbox(label="Enter your prompt", value=default_prompt, lines=6)
463
  with gr.Row():
464
- auto_refresh_checkbox = gr.Checkbox(
465
  label="Enable Auto Refresh",
466
- value=False # Default to unchecked
467
  )
468
- submit_button = gr.Button("Submit Query ")
469
- # Use a Checkbox to control the periodic updates
470
-
471
- # Add a textarea to store the plain text version for copying
472
- status_text = gr.Textbox(label="Response Text ", visible=True)
473
-
474
  response_output = gr.Textbox(label="Response", interactive=False)
475
  token_info = gr.Textbox(label="Token Info", interactive=False)
476
-
477
- # Job Status Output in right column
478
  with gr.Column(scale=1):
479
- # Change Job Status output to an HTML component for proper formatting
480
  status_output = gr.HTML(label="Job Status", interactive=False)
481
-
482
  job_id_display = gr.Textbox(label="Job ID", interactive=False)
483
  input_tokens_display = gr.Textbox(label="Input Tokens", interactive=False)
484
  output_tokens_display = gr.Textbox(label="Output Tokens", interactive=False)
@@ -488,32 +416,39 @@ with gr.Blocks() as app:
488
  # Set up interactions
489
  # ------------------------------
490
 
491
- # Load file interaction (dummy function for now)
492
- def load_file(file, sheet_name):
493
- global sheet_data
494
- global file_name
495
- global sheet
496
  file_name = file
497
- sheet = sheet_name
498
-
499
- if file is None or sheet_name.strip() == "":
500
- return "Please upload a file and enter a valid sheet name."
501
-
502
  try:
503
- df = pd.read_excel(file.name, sheet_name=sheet_name)
504
- sheet_data = df.to_string(index=False) # Convert sheet data to string format
505
- return sheet_data # Display sheet data in UI
 
 
 
 
 
 
506
  except Exception as e:
507
- return f"Error reading sheet: {str(e)}"
508
 
509
  load_button_file.click(
510
  fn=load_file,
511
- inputs=[file_input, sheet_input_file],
512
  outputs=sheet_output_file
513
  )
514
 
515
- # When submitting a query asynchronously
516
- submit_button.click(
 
 
 
 
 
 
517
  fn=submit_query_async,
518
  inputs=[prompt_input, model_dropdown],
519
  outputs=[
@@ -523,7 +458,6 @@ with gr.Blocks() as app:
523
  ]
524
  )
525
 
526
- # Check job status interaction
527
  check_status_button.click(
528
  fn=check_job_status,
529
  inputs=[job_id_input],
@@ -531,28 +465,19 @@ with gr.Blocks() as app:
531
  output_tokens_display, job_query_display]
532
  )
533
 
534
- # Refresh the job list
535
  refresh_button.click(
536
  fn=refresh_job_list,
537
  inputs=[],
538
  outputs=job_list_display
539
  )
540
 
541
- # Use the Checkbox to control the periodic updates
542
- auto_refresh_checkbox.change(
543
  fn=periodic_update,
544
- inputs=[auto_refresh_checkbox],
545
  outputs=[job_list_display, status_output, status_text, job_id_display, input_tokens_display, output_tokens_display, job_query_display],
546
  every=3
547
  )
548
 
549
-
550
- # Connect the copy button to show the text in the textbox and make it visible temporarily
551
- def show_copy_text(text):
552
- # Simply return the text value and make the component visible
553
- return gr.update(value=text, visible=True)
554
-
555
-
556
  if __name__ == "__main__":
557
  debug_print("Launching Gradio UI...")
558
  app.queue().launch(share=False)
 
8
  import uuid
9
  import queue
10
  import time
11
+ import fitz # PyMuPDF for reading PDF files
12
  from transformers import AutoTokenizer
13
  from mistralai import Mistral
14
  from huggingface_hub import InferenceClient
15
 
 
16
  # ------------------------------
17
  # Helper functions and globals
18
  # ------------------------------
19
  sheet_data = None
20
  file_name = None
 
21
 
22
  def debug_print(message: str):
23
  print(f"[{datetime.datetime.now().isoformat()}] {message}", flush=True)
 
40
  return len(text.split())
41
 
42
  def generate_response(prompt: str, model_name: str, sheet_data: str) -> str:
43
+ full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}" # Append loaded text to prompt
44
 
45
  if "Mistral" in model_name:
46
  mistral_api_key = os.getenv("MISTRAL_API_KEY")
 
72
  else:
73
  raise ValueError("Invalid model selection. Please choose either 'Mistral-API' or 'Meta-Llama-3'.")
74
 
 
75
  def process_query(prompt: str, model_name: str):
76
  global sheet_data
77
 
 
78
  if sheet_data is None:
79
  sheet_data = get_sheet_data()
80
 
81
+ full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}"
82
  debug_print(f"Processing query with model {model_name}: {full_prompt}")
83
 
 
84
  response = generate_response(prompt, model_name, sheet_data)
85
+ input_tokens = count_tokens(prompt + "\n\n" + sheet_data)
 
 
86
  output_tokens = count_tokens(response)
87
 
 
88
  return response, f"Input tokens: {input_tokens}", f"Output tokens: {output_tokens}"
89
 
90
  def ui_process_query(prompt, model_name):
91
  return process_query(prompt, model_name)
92
 
93
+ # ------------------------------
94
+ # Cleaning Functions
95
+ # ------------------------------
96
+
97
+ def clean_text(text: str, remove_spaces: bool, remove_headers_footers: bool, lowercase: bool, remove_special: bool) -> str:
98
+ """
99
+ Cleans the given text based on the provided options.
100
+ """
101
+ # Remove extra spaces & newlines
102
+ if remove_spaces:
103
+ text = re.sub(r'\s+', ' ', text).strip()
104
+
105
+ # Remove headers/footers: a simple heuristic to remove lines that repeat
106
+ if remove_headers_footers:
107
+ lines = text.split('\n')
108
+ freq = {}
109
+ for line in lines:
110
+ line_stripped = line.strip()
111
+ if line_stripped:
112
+ freq[line] = freq.get(line, 0) + 1
113
+ lines = [line for line in lines if freq.get(line, 0) <= 1]
114
+ text = "\n".join(lines)
115
+
116
+ if lowercase:
117
+ text = text.lower()
118
+
119
+ if remove_special:
120
+ text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
121
+
122
+ return text
123
+
124
+ def execute_cleaning(text: str, remove_spaces: bool, remove_headers: bool, lowercase: bool, remove_special: bool) -> str:
125
+ if not text or text.strip() == "":
126
+ return "No text available for cleaning."
127
+ cleaned = clean_text(text, remove_spaces, remove_headers, lowercase, remove_special)
128
+ return cleaned
129
+
130
  # ------------------------------
131
  # Global variables for background jobs
132
  # ------------------------------
 
144
  if not jobs:
145
  return "No jobs found. Submit a query or load files to create jobs."
146
 
 
147
  sorted_jobs = sorted(
148
  [(job_id, job_info) for job_id, job_info in jobs.items()],
149
  key=lambda x: x[1].get("start_time", 0),
 
156
  query = job_info.get("query", "")
157
  start_time = job_info.get("start_time", 0)
158
  time_str = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
 
 
159
  query_preview = query[:30] + "..." if query and len(query) > 30 else query or "N/A"
160
 
 
161
  if status == "processing":
162
  status_formatted = f"<span style='color: red'>⏳ {status}</span>"
163
  elif status == "completed":
 
174
 
175
  def get_sheet_data():
176
  global sheet_data
177
+ return sheet_data if sheet_data else "No data loaded."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  def process_in_background(job_id, func, args):
 
180
  result = func(*args)
181
  results_queue.put((job_id, result))
182
  debug_print(f"Job {job_id} finished processing in background.")
183
 
 
184
  def submit_query_async(query, model_choice=None):
 
185
  global last_job_id
186
  global sheet_data
187
 
 
191
  job_id = str(uuid.uuid4())
192
  debug_print(f"Starting async job {job_id} for query: {query}")
193
 
 
 
194
  threading.Thread(
195
  target=process_in_background,
196
  args=(job_id, process_query, [query, model_choice or "Mistral-API"])
 
212
  f"Job ID: {job_id}",
213
  f"Input tokens: {count_tokens(query)}",
214
  "Output tokens: pending",
215
+ job_id,
216
+ query,
217
+ get_job_list()
218
  )
219
 
220
  def job_selected(job_id):
 
233
  html_response = "<div style='font-family: monospace;'><p>Please enter a job ID.</p></div>"
234
  return html_response, "", "", "", ""
235
 
 
236
  try:
237
  while not results_queue.empty():
238
  completed_id, result = results_queue.get_nowait()
 
291
  to_delete = []
292
 
293
  for job_id, job in jobs.items():
 
294
  if job["status"] == "completed" and (current_time - job.get("end_time", 0)) > 86400:
295
  to_delete.append(job_id)
296
  elif job["status"] == "processing" and (current_time - job.get("start_time", 0)) > 172800:
 
304
 
305
  # Function to run query (dummy function)
306
  def run_query(max_value):
 
307
  return [[i, i**2] for i in range(1, max_value + 1)]
308
 
 
309
  def periodic_update(is_checked):
310
  interval = 3 if is_checked else None
311
  debug_print(f"Auto-refresh checkbox is {'checked' if is_checked else 'unchecked'}, every={interval}")
 
313
  global last_job_id
314
  job_list_md = refresh_job_list()
315
  job_status = check_job_status(last_job_id) if last_job_id else ("No job ID available", "", "", "", "")
 
 
316
  from bs4 import BeautifulSoup
317
  html_content = job_status[0]
318
  plain_text = ""
319
  if html_content:
320
  soup = BeautifulSoup(html_content, "html.parser")
321
  plain_text = soup.get_text()
 
 
322
  return job_list_md, job_status[0], plain_text, job_status[1], job_status[2], job_status[3], job_status[4]
323
  else:
 
324
  return "", "", "", "", "", "", ""
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  # ------------------------------
327
  # Gradio UI Layout: Scouting AI App
328
  # ------------------------------
329
 
330
  with gr.Blocks() as app:
331
  # App Title and Description
332
+ gr.Markdown("## πŸ“– PDF Conversion")
333
+ gr.Markdown("Text cleaning and processing tools.")
334
 
335
+ # Top section: File Load and Job Information (two columns)
336
  with gr.Row():
337
  # Left Column: File Load Section (50% width)
338
  with gr.Column(scale=1):
339
  gr.Markdown("### πŸ“ Load File Section")
340
+ gr.Markdown("Upload your **.pdf** file below and specify the page range to extract text.")
341
  file_input = gr.File(label="Upload .pdf File")
342
  page_start_input_file = gr.Textbox(label="Page Start")
343
  page_end_input_file = gr.Textbox(label="Page End")
344
  load_button_file = gr.Button("Load File")
345
+ sheet_output_file = gr.Textbox(label="Extracted Text", interactive=False)
346
 
347
  # Right Column: Job Information Section (50% width)
348
  with gr.Column(scale=1):
349
  gr.Markdown("### πŸ“Š Job Information")
350
  gr.Markdown("View all submitted jobs, refresh the list, and check the status of individual jobs.")
 
 
351
  job_list_display = gr.Markdown(
352
  get_job_list(),
353
  elem_id="job-list-display",
354
  elem_classes=["scrollable-job-list"]
355
  )
 
 
356
  gr.HTML("""
357
  <style>
358
  .scrollable-job-list {
 
364
  }
365
  </style>
366
  """)
 
367
  refresh_button = gr.Button("Refresh Job List")
 
368
  gr.Markdown("#### πŸ” Check Job Status")
369
  job_id_input = gr.Textbox(label="Enter Job ID")
370
  check_status_button = gr.Button("Check Job Status")
371
+
372
+ # New row: Cleaning Tasks placed in two equal columns under the load section
373
  with gr.Row():
374
+ # Left half: Cleaning Tasks checkboxes and Clean button
375
  with gr.Column(scale=1):
376
+ gr.Markdown("### Cleaning Options")
377
+ remove_spaces_checkbox = gr.Checkbox(label="Remove extra spaces & newlines: Clean unnecessary whitespace.", value=True)
378
+ remove_headers_checkbox = gr.Checkbox(label="Remove headers/footers: If repeated text appears on every page", value=False)
379
+ lowercase_checkbox = gr.Checkbox(label="Convert text to lowercase: For uniformity in text analysis.", value=False)
380
+ remove_special_checkbox = gr.Checkbox(label="Remove special characters: Useful for structured data extraction", value=False)
381
+ clean_button = gr.Button("Clean")
382
+
383
+
384
+ # Right half: Display Cleaned Text
385
+ with gr.Column(scale=1):
386
+ cleaned_output = gr.Textbox(label="Cleaned Text", interactive=False)
387
+
388
+ # Submit Query Section remains unchanged
389
  with gr.Row():
 
390
  with gr.Column(scale=1):
391
  gr.Markdown("### πŸš€ Submit Query")
392
  gr.Markdown("Enter your prompt below and choose a model. Your query will be processed in the background.")
393
  model_dropdown = gr.Dropdown(
394
  choices=["πŸ‡ΊπŸ‡Έ Remote Meta-Llama-3", "πŸ‡ͺπŸ‡Ί Mistral-API"],
395
+ value="πŸ‡ͺπŸ‡Ί Mistral-API",
396
  label="Select Model"
397
  )
398
+ prompt_input = gr.Textbox(label="Enter your prompt", value="", lines=6)
399
  with gr.Row():
400
+ auto_refresh_checkbox_query = gr.Checkbox(
401
  label="Enable Auto Refresh",
402
+ value=False
403
  )
404
+ submit_query_button = gr.Button("Submit Query")
405
+ status_text = gr.Textbox(label="Response Text", visible=True)
 
 
 
 
406
  response_output = gr.Textbox(label="Response", interactive=False)
407
  token_info = gr.Textbox(label="Token Info", interactive=False)
 
 
408
  with gr.Column(scale=1):
 
409
  status_output = gr.HTML(label="Job Status", interactive=False)
 
410
  job_id_display = gr.Textbox(label="Job ID", interactive=False)
411
  input_tokens_display = gr.Textbox(label="Input Tokens", interactive=False)
412
  output_tokens_display = gr.Textbox(label="Output Tokens", interactive=False)
 
416
  # Set up interactions
417
  # ------------------------------
418
 
419
+ # Updated Load file interaction: read PDF pages
420
+ def load_file(file, page_start, page_end):
421
+ global sheet_data, file_name
 
 
422
  file_name = file
423
+ if file is None or str(page_start).strip() == "" or str(page_end).strip() == "":
424
+ return "Please upload a file and enter valid page numbers."
 
 
 
425
  try:
426
+ doc = fitz.open(file.name)
427
+ ps = int(page_start)
428
+ pe = int(page_end)
429
+ text = ""
430
+ # Convert page numbers from 1-indexed to 0-indexed
431
+ for page_num in range(ps - 1, pe):
432
+ text += doc[page_num].get_text() + "\n"
433
+ sheet_data = text
434
+ return text
435
  except Exception as e:
436
+ return f"Error reading PDF: {str(e)}"
437
 
438
  load_button_file.click(
439
  fn=load_file,
440
+ inputs=[file_input, page_start_input_file, page_end_input_file],
441
  outputs=sheet_output_file
442
  )
443
 
444
+ # Cleaning button interaction: clean the loaded text using selected options.
445
+ clean_button.click(
446
+ fn=execute_cleaning,
447
+ inputs=[sheet_output_file, remove_spaces_checkbox, remove_headers_checkbox, lowercase_checkbox, remove_special_checkbox],
448
+ outputs=cleaned_output
449
+ )
450
+
451
+ submit_query_button.click(
452
  fn=submit_query_async,
453
  inputs=[prompt_input, model_dropdown],
454
  outputs=[
 
458
  ]
459
  )
460
 
 
461
  check_status_button.click(
462
  fn=check_job_status,
463
  inputs=[job_id_input],
 
465
  output_tokens_display, job_query_display]
466
  )
467
 
 
468
  refresh_button.click(
469
  fn=refresh_job_list,
470
  inputs=[],
471
  outputs=job_list_display
472
  )
473
 
474
+ auto_refresh_checkbox_query.change(
 
475
  fn=periodic_update,
476
+ inputs=[auto_refresh_checkbox_query],
477
  outputs=[job_list_display, status_output, status_text, job_id_display, input_tokens_display, output_tokens_display, job_query_display],
478
  every=3
479
  )
480
 
 
 
 
 
 
 
 
481
  if __name__ == "__main__":
482
  debug_print("Launching Gradio UI...")
483
  app.queue().launch(share=False)
requirements.txt CHANGED
@@ -41,3 +41,5 @@ pydantic==2.9.0
41
  sentence-transformers>=2.4.0
42
 
43
  mistralai==1.5.0
 
 
 
41
  sentence-transformers>=2.4.0
42
 
43
  mistralai==1.5.0
44
+
45
+ PyMuPDF