alxd committed on
Commit ·
50ffeff
1
Parent(s): b7811cf
basic cleaning tasks
Browse files- pdf2txt.py +98 -173
- requirements.txt +2 -0
pdf2txt.py
CHANGED
|
@@ -8,17 +8,16 @@ import threading
|
|
| 8 |
import uuid
|
| 9 |
import queue
|
| 10 |
import time
|
|
|
|
| 11 |
from transformers import AutoTokenizer
|
| 12 |
from mistralai import Mistral
|
| 13 |
from huggingface_hub import InferenceClient
|
| 14 |
|
| 15 |
-
|
| 16 |
# ------------------------------
|
| 17 |
# Helper functions and globals
|
| 18 |
# ------------------------------
|
| 19 |
sheet_data = None
|
| 20 |
file_name = None
|
| 21 |
-
sheet = None
|
| 22 |
|
| 23 |
def debug_print(message: str):
|
| 24 |
print(f"[{datetime.datetime.now().isoformat()}] {message}", flush=True)
|
|
@@ -41,7 +40,7 @@ def count_tokens(text: str) -> int:
|
|
| 41 |
return len(text.split())
|
| 42 |
|
| 43 |
def generate_response(prompt: str, model_name: str, sheet_data: str) -> str:
|
| 44 |
-
full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}" # Append
|
| 45 |
|
| 46 |
if "Mistral" in model_name:
|
| 47 |
mistral_api_key = os.getenv("MISTRAL_API_KEY")
|
|
@@ -73,30 +72,61 @@ def generate_response(prompt: str, model_name: str, sheet_data: str) -> str:
|
|
| 73 |
else:
|
| 74 |
raise ValueError("Invalid model selection. Please choose either 'Mistral-API' or 'Meta-Llama-3'.")
|
| 75 |
|
| 76 |
-
|
| 77 |
def process_query(prompt: str, model_name: str):
|
| 78 |
global sheet_data
|
| 79 |
|
| 80 |
-
# Handle the case where sheet_data might be None
|
| 81 |
if sheet_data is None:
|
| 82 |
sheet_data = get_sheet_data()
|
| 83 |
|
| 84 |
-
full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}"
|
| 85 |
debug_print(f"Processing query with model {model_name}: {full_prompt}")
|
| 86 |
|
| 87 |
-
# Generate the response using the specified model and sheet data
|
| 88 |
response = generate_response(prompt, model_name, sheet_data)
|
| 89 |
-
|
| 90 |
-
# Count the number of tokens for input and output
|
| 91 |
-
input_tokens = count_tokens(prompt + "\n\n" + sheet_data) # Include sheet data in the input token count
|
| 92 |
output_tokens = count_tokens(response)
|
| 93 |
|
| 94 |
-
# Return the response along with token counts
|
| 95 |
return response, f"Input tokens: {input_tokens}", f"Output tokens: {output_tokens}"
|
| 96 |
|
| 97 |
def ui_process_query(prompt, model_name):
|
| 98 |
return process_query(prompt, model_name)
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
# ------------------------------
|
| 101 |
# Global variables for background jobs
|
| 102 |
# ------------------------------
|
|
@@ -114,7 +144,6 @@ def get_job_list():
|
|
| 114 |
if not jobs:
|
| 115 |
return "No jobs found. Submit a query or load files to create jobs."
|
| 116 |
|
| 117 |
-
# Sort jobs by start time (newest first)
|
| 118 |
sorted_jobs = sorted(
|
| 119 |
[(job_id, job_info) for job_id, job_info in jobs.items()],
|
| 120 |
key=lambda x: x[1].get("start_time", 0),
|
|
@@ -127,11 +156,8 @@ def get_job_list():
|
|
| 127 |
query = job_info.get("query", "")
|
| 128 |
start_time = job_info.get("start_time", 0)
|
| 129 |
time_str = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
|
| 130 |
-
|
| 131 |
-
# Create a shortened query preview
|
| 132 |
query_preview = query[:30] + "..." if query and len(query) > 30 else query or "N/A"
|
| 133 |
|
| 134 |
-
# Color-code the status display
|
| 135 |
if status == "processing":
|
| 136 |
status_formatted = f"<span style='color: red'>⏳ {status}</span>"
|
| 137 |
elif status == "completed":
|
|
@@ -148,33 +174,14 @@ def get_job_list():
|
|
| 148 |
|
| 149 |
def get_sheet_data():
|
| 150 |
global sheet_data
|
| 151 |
-
|
| 152 |
-
global sheet
|
| 153 |
-
file = file_name
|
| 154 |
-
sheet_name = sheet
|
| 155 |
-
print ("file name: ",file," sheet name: ",sheet_name," ")
|
| 156 |
-
|
| 157 |
-
if sheet_data is None:
|
| 158 |
-
try:
|
| 159 |
-
df = pd.read_excel(file.name, sheet_name=sheet_name)
|
| 160 |
-
sheet_data = df.to_string(index=False) # Convert sheet data to string format
|
| 161 |
-
return sheet_data # Display sheet data in UI
|
| 162 |
-
except Exception as e:
|
| 163 |
-
return f"Error reading sheet: {str(e)}"
|
| 164 |
-
else:
|
| 165 |
-
return sheet_data
|
| 166 |
-
|
| 167 |
-
# Assuming process_in_background is using threading to call process_query
|
| 168 |
|
| 169 |
def process_in_background(job_id, func, args):
|
| 170 |
-
"""Runs a function in the background and stores its result in a shared queue."""
|
| 171 |
result = func(*args)
|
| 172 |
results_queue.put((job_id, result))
|
| 173 |
debug_print(f"Job {job_id} finished processing in background.")
|
| 174 |
|
| 175 |
-
|
| 176 |
def submit_query_async(query, model_choice=None):
|
| 177 |
-
"""Asynchronous version of submit_query_updated to prevent timeouts."""
|
| 178 |
global last_job_id
|
| 179 |
global sheet_data
|
| 180 |
|
|
@@ -184,8 +191,6 @@ def submit_query_async(query, model_choice=None):
|
|
| 184 |
job_id = str(uuid.uuid4())
|
| 185 |
debug_print(f"Starting async job {job_id} for query: {query}")
|
| 186 |
|
| 187 |
-
|
| 188 |
-
# Start background thread to process the query
|
| 189 |
threading.Thread(
|
| 190 |
target=process_in_background,
|
| 191 |
args=(job_id, process_query, [query, model_choice or "Mistral-API"])
|
|
@@ -207,9 +212,9 @@ def submit_query_async(query, model_choice=None):
|
|
| 207 |
f"Job ID: {job_id}",
|
| 208 |
f"Input tokens: {count_tokens(query)}",
|
| 209 |
"Output tokens: pending",
|
| 210 |
-
job_id,
|
| 211 |
-
query,
|
| 212 |
-
get_job_list()
|
| 213 |
)
|
| 214 |
|
| 215 |
def job_selected(job_id):
|
|
@@ -228,7 +233,6 @@ def check_job_status(job_id):
|
|
| 228 |
html_response = "<div style='font-family: monospace;'><p>Please enter a job ID.</p></div>"
|
| 229 |
return html_response, "", "", "", ""
|
| 230 |
|
| 231 |
-
# Process any completed jobs in the results queue
|
| 232 |
try:
|
| 233 |
while not results_queue.empty():
|
| 234 |
completed_id, result = results_queue.get_nowait()
|
|
@@ -287,7 +291,6 @@ def cleanup_old_jobs():
|
|
| 287 |
to_delete = []
|
| 288 |
|
| 289 |
for job_id, job in jobs.items():
|
| 290 |
-
# Completed jobs older than 24 hours and processing jobs older than 48 hours will be removed.
|
| 291 |
if job["status"] == "completed" and (current_time - job.get("end_time", 0)) > 86400:
|
| 292 |
to_delete.append(job_id)
|
| 293 |
elif job["status"] == "processing" and (current_time - job.get("start_time", 0)) > 172800:
|
|
@@ -301,10 +304,8 @@ def cleanup_old_jobs():
|
|
| 301 |
|
| 302 |
# Function to run query (dummy function)
|
| 303 |
def run_query(max_value):
|
| 304 |
-
# Simulate a data retrieval or processing function
|
| 305 |
return [[i, i**2] for i in range(1, max_value + 1)]
|
| 306 |
|
| 307 |
-
# Function to call both refresh_job_list and check_job_status using the last job ID
|
| 308 |
def periodic_update(is_checked):
|
| 309 |
interval = 3 if is_checked else None
|
| 310 |
debug_print(f"Auto-refresh checkbox is {'checked' if is_checked else 'unchecked'}, every={interval}")
|
|
@@ -312,111 +313,46 @@ def periodic_update(is_checked):
|
|
| 312 |
global last_job_id
|
| 313 |
job_list_md = refresh_job_list()
|
| 314 |
job_status = check_job_status(last_job_id) if last_job_id else ("No job ID available", "", "", "", "")
|
| 315 |
-
|
| 316 |
-
# Extract plain text from HTML for status_text
|
| 317 |
from bs4 import BeautifulSoup
|
| 318 |
html_content = job_status[0]
|
| 319 |
plain_text = ""
|
| 320 |
if html_content:
|
| 321 |
soup = BeautifulSoup(html_content, "html.parser")
|
| 322 |
plain_text = soup.get_text()
|
| 323 |
-
|
| 324 |
-
# Return all expected outputs, including status_text
|
| 325 |
return job_list_md, job_status[0], plain_text, job_status[1], job_status[2], job_status[3], job_status[4]
|
| 326 |
else:
|
| 327 |
-
# Return empty values to stop updates - make sure to match the number of expected outputs
|
| 328 |
return "", "", "", "", "", "", ""
|
| 329 |
|
| 330 |
-
|
| 331 |
-
# Add email sending function
|
| 332 |
-
def send_email(email_address, content, is_formatted=True):
|
| 333 |
-
if not email_address or "@" not in email_address:
|
| 334 |
-
return "Please enter a valid email address"
|
| 335 |
-
|
| 336 |
-
try:
|
| 337 |
-
creds = get_gmail_credentials()
|
| 338 |
-
service = build("gmail", "v1", credentials=creds)
|
| 339 |
-
|
| 340 |
-
# Create email message with appropriate MIME type
|
| 341 |
-
msg = MIMEMultipart()
|
| 342 |
-
msg["to"] = email_address
|
| 343 |
-
msg["subject"] = "Scouting AI Report"
|
| 344 |
-
msg.attach(MIMEText(content, "html" if is_formatted else "plain"))
|
| 345 |
-
|
| 346 |
-
# Encode email message in base64
|
| 347 |
-
encoded_msg = base64.urlsafe_b64encode(msg.as_bytes()).decode()
|
| 348 |
-
send_message = {"raw": encoded_msg}
|
| 349 |
-
|
| 350 |
-
# Send email using Gmail API
|
| 351 |
-
service.users().messages().send(userId="me", body=send_message).execute()
|
| 352 |
-
return "Email sent successfully via Gmail API!"
|
| 353 |
-
|
| 354 |
-
except Exception as e:
|
| 355 |
-
return f"Failed to send email: {str(e)}"
|
| 356 |
-
|
| 357 |
-
# Function to copy content to clipboard
|
| 358 |
-
def copy_to_clipboard(content):
|
| 359 |
-
import pyperclip
|
| 360 |
-
pyperclip.copy(content)
|
| 361 |
-
return "Copied to clipboard!"
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
# Function to convert HTML to plain text using BeautifulSoup
|
| 365 |
-
def copy_plain_text(html_content):
|
| 366 |
-
try:
|
| 367 |
-
from bs4 import BeautifulSoup
|
| 368 |
-
except ImportError:
|
| 369 |
-
return "Error: BeautifulSoup is required to convert HTML to plain text. Please install it."
|
| 370 |
-
soup = BeautifulSoup(html_content, "html.parser")
|
| 371 |
-
plain_text = soup.get_text()
|
| 372 |
-
import pyperclip
|
| 373 |
-
pyperclip.copy(plain_text)
|
| 374 |
-
|
| 375 |
-
return "Copied to clipboard!"
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
# Default prompt template
|
| 379 |
-
default_prompt = (
|
| 380 |
-
"you are a scouter and play against this player with this stats. "
|
| 381 |
-
"Make an scouting report for head coach with weaknesses and strength, and present strategy to stop his strength "
|
| 382 |
-
"and explore his weaknesses acoording with this stats, make easily to read combine strength with strategy to stop "
|
| 383 |
-
"and weaknesses with explore and in the final of the raport Key points of emphesize. Use html to output the image and dark color backgrounds (pallette dark green, dark red, etc.) for he different sections of the formatted output. "
|
| 384 |
-
)
|
| 385 |
-
|
| 386 |
# ------------------------------
|
| 387 |
# Gradio UI Layout: Scouting AI App
|
| 388 |
# ------------------------------
|
| 389 |
|
| 390 |
with gr.Blocks() as app:
|
| 391 |
# App Title and Description
|
| 392 |
-
gr.Markdown("## PDF
|
| 393 |
-
gr.Markdown("
|
| 394 |
|
| 395 |
-
#
|
| 396 |
with gr.Row():
|
| 397 |
# Left Column: File Load Section (50% width)
|
| 398 |
with gr.Column(scale=1):
|
| 399 |
gr.Markdown("### π Load File Section")
|
| 400 |
-
gr.Markdown("Upload your **.pdf** file below
|
| 401 |
file_input = gr.File(label="Upload .pdf File")
|
| 402 |
page_start_input_file = gr.Textbox(label="Page Start")
|
| 403 |
page_end_input_file = gr.Textbox(label="Page End")
|
| 404 |
load_button_file = gr.Button("Load File")
|
| 405 |
-
sheet_output_file = gr.Textbox(label="
|
| 406 |
|
| 407 |
# Right Column: Job Information Section (50% width)
|
| 408 |
with gr.Column(scale=1):
|
| 409 |
gr.Markdown("### π Job Information")
|
| 410 |
gr.Markdown("View all submitted jobs, refresh the list, and check the status of individual jobs.")
|
| 411 |
-
|
| 412 |
-
# Fixed-height job list with scrollbar
|
| 413 |
job_list_display = gr.Markdown(
|
| 414 |
get_job_list(),
|
| 415 |
elem_id="job-list-display",
|
| 416 |
elem_classes=["scrollable-job-list"]
|
| 417 |
)
|
| 418 |
-
|
| 419 |
-
# Add CSS for scrollable job list
|
| 420 |
gr.HTML("""
|
| 421 |
<style>
|
| 422 |
.scrollable-job-list {
|
|
@@ -428,57 +364,49 @@ with gr.Blocks() as app:
|
|
| 428 |
}
|
| 429 |
</style>
|
| 430 |
""")
|
| 431 |
-
|
| 432 |
refresh_button = gr.Button("Refresh Job List")
|
| 433 |
-
|
| 434 |
gr.Markdown("#### π Check Job Status")
|
| 435 |
job_id_input = gr.Textbox(label="Enter Job ID")
|
| 436 |
check_status_button = gr.Button("Check Job Status")
|
| 437 |
-
|
| 438 |
-
# Cleaning
|
| 439 |
with gr.Row():
|
| 440 |
-
# Left
|
| 441 |
with gr.Column(scale=1):
|
| 442 |
-
gr.Markdown("### Cleaning
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
| 452 |
with gr.Row():
|
| 453 |
-
# Left Column: Submit Query Section
|
| 454 |
with gr.Column(scale=1):
|
| 455 |
gr.Markdown("### π Submit Query")
|
| 456 |
gr.Markdown("Enter your prompt below and choose a model. Your query will be processed in the background.")
|
| 457 |
model_dropdown = gr.Dropdown(
|
| 458 |
choices=["🇺🇸 Remote Meta-Llama-3", "🇪🇺 Mistral-API"],
|
| 459 |
-
value="🇪🇺 Mistral-API",
|
| 460 |
label="Select Model"
|
| 461 |
)
|
| 462 |
-
prompt_input = gr.Textbox(label="Enter your prompt", value=
|
| 463 |
with gr.Row():
|
| 464 |
-
|
| 465 |
label="Enable Auto Refresh",
|
| 466 |
-
value=False
|
| 467 |
)
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
# Add a textarea to store the plain text version for copying
|
| 472 |
-
status_text = gr.Textbox(label="Response Text ", visible=True)
|
| 473 |
-
|
| 474 |
response_output = gr.Textbox(label="Response", interactive=False)
|
| 475 |
token_info = gr.Textbox(label="Token Info", interactive=False)
|
| 476 |
-
|
| 477 |
-
# Job Status Output in right column
|
| 478 |
with gr.Column(scale=1):
|
| 479 |
-
# Change Job Status output to an HTML component for proper formatting
|
| 480 |
status_output = gr.HTML(label="Job Status", interactive=False)
|
| 481 |
-
|
| 482 |
job_id_display = gr.Textbox(label="Job ID", interactive=False)
|
| 483 |
input_tokens_display = gr.Textbox(label="Input Tokens", interactive=False)
|
| 484 |
output_tokens_display = gr.Textbox(label="Output Tokens", interactive=False)
|
|
@@ -488,32 +416,39 @@ with gr.Blocks() as app:
|
|
| 488 |
# Set up interactions
|
| 489 |
# ------------------------------
|
| 490 |
|
| 491 |
-
# Load file interaction
|
| 492 |
-
def load_file(file,
|
| 493 |
-
global sheet_data
|
| 494 |
-
global file_name
|
| 495 |
-
global sheet
|
| 496 |
file_name = file
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
if file is None or sheet_name.strip() == "":
|
| 500 |
-
return "Please upload a file and enter a valid sheet name."
|
| 501 |
-
|
| 502 |
try:
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
except Exception as e:
|
| 507 |
-
return f"Error reading
|
| 508 |
|
| 509 |
load_button_file.click(
|
| 510 |
fn=load_file,
|
| 511 |
-
inputs=[file_input,
|
| 512 |
outputs=sheet_output_file
|
| 513 |
)
|
| 514 |
|
| 515 |
-
#
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
fn=submit_query_async,
|
| 518 |
inputs=[prompt_input, model_dropdown],
|
| 519 |
outputs=[
|
|
@@ -523,7 +458,6 @@ with gr.Blocks() as app:
|
|
| 523 |
]
|
| 524 |
)
|
| 525 |
|
| 526 |
-
# Check job status interaction
|
| 527 |
check_status_button.click(
|
| 528 |
fn=check_job_status,
|
| 529 |
inputs=[job_id_input],
|
|
@@ -531,28 +465,19 @@ with gr.Blocks() as app:
|
|
| 531 |
output_tokens_display, job_query_display]
|
| 532 |
)
|
| 533 |
|
| 534 |
-
# Refresh the job list
|
| 535 |
refresh_button.click(
|
| 536 |
fn=refresh_job_list,
|
| 537 |
inputs=[],
|
| 538 |
outputs=job_list_display
|
| 539 |
)
|
| 540 |
|
| 541 |
-
|
| 542 |
-
auto_refresh_checkbox.change(
|
| 543 |
fn=periodic_update,
|
| 544 |
-
inputs=[
|
| 545 |
outputs=[job_list_display, status_output, status_text, job_id_display, input_tokens_display, output_tokens_display, job_query_display],
|
| 546 |
every=3
|
| 547 |
)
|
| 548 |
|
| 549 |
-
|
| 550 |
-
# Connect the copy button to show the text in the textbox and make it visible temporarily
|
| 551 |
-
def show_copy_text(text):
|
| 552 |
-
# Simply return the text value and make the component visible
|
| 553 |
-
return gr.update(value=text, visible=True)
|
| 554 |
-
|
| 555 |
-
|
| 556 |
if __name__ == "__main__":
|
| 557 |
debug_print("Launching Gradio UI...")
|
| 558 |
app.queue().launch(share=False)
|
|
|
|
| 8 |
import uuid
|
| 9 |
import queue
|
| 10 |
import time
|
| 11 |
+
import fitz # PyMuPDF for reading PDF files
|
| 12 |
from transformers import AutoTokenizer
|
| 13 |
from mistralai import Mistral
|
| 14 |
from huggingface_hub import InferenceClient
|
| 15 |
|
|
|
|
| 16 |
# ------------------------------
|
| 17 |
# Helper functions and globals
|
| 18 |
# ------------------------------
|
| 19 |
sheet_data = None
|
| 20 |
file_name = None
|
|
|
|
| 21 |
|
| 22 |
def debug_print(message: str):
|
| 23 |
print(f"[{datetime.datetime.now().isoformat()}] {message}", flush=True)
|
|
|
|
| 40 |
return len(text.split())
|
| 41 |
|
| 42 |
def generate_response(prompt: str, model_name: str, sheet_data: str) -> str:
|
| 43 |
+
full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}" # Append loaded text to prompt
|
| 44 |
|
| 45 |
if "Mistral" in model_name:
|
| 46 |
mistral_api_key = os.getenv("MISTRAL_API_KEY")
|
|
|
|
| 72 |
else:
|
| 73 |
raise ValueError("Invalid model selection. Please choose either 'Mistral-API' or 'Meta-Llama-3'.")
|
| 74 |
|
|
|
|
| 75 |
def process_query(prompt: str, model_name: str):
|
| 76 |
global sheet_data
|
| 77 |
|
|
|
|
| 78 |
if sheet_data is None:
|
| 79 |
sheet_data = get_sheet_data()
|
| 80 |
|
| 81 |
+
full_prompt = f"{prompt}\n\nSheet Data:\n{sheet_data}"
|
| 82 |
debug_print(f"Processing query with model {model_name}: {full_prompt}")
|
| 83 |
|
|
|
|
| 84 |
response = generate_response(prompt, model_name, sheet_data)
|
| 85 |
+
input_tokens = count_tokens(prompt + "\n\n" + sheet_data)
|
|
|
|
|
|
|
| 86 |
output_tokens = count_tokens(response)
|
| 87 |
|
|
|
|
| 88 |
return response, f"Input tokens: {input_tokens}", f"Output tokens: {output_tokens}"
|
| 89 |
|
| 90 |
def ui_process_query(prompt, model_name):
|
| 91 |
return process_query(prompt, model_name)
|
| 92 |
|
| 93 |
+
# ------------------------------
|
| 94 |
+
# Cleaning Functions
|
| 95 |
+
# ------------------------------
|
| 96 |
+
|
| 97 |
+
def clean_text(text: str, remove_spaces: bool, remove_headers_footers: bool, lowercase: bool, remove_special: bool) -> str:
|
| 98 |
+
"""
|
| 99 |
+
Cleans the given text based on the provided options.
|
| 100 |
+
"""
|
| 101 |
+
# Remove extra spaces & newlines
|
| 102 |
+
if remove_spaces:
|
| 103 |
+
text = re.sub(r'\s+', ' ', text).strip()
|
| 104 |
+
|
| 105 |
+
# Remove headers/footers: a simple heuristic to remove lines that repeat
|
| 106 |
+
if remove_headers_footers:
|
| 107 |
+
lines = text.split('\n')
|
| 108 |
+
freq = {}
|
| 109 |
+
for line in lines:
|
| 110 |
+
line_stripped = line.strip()
|
| 111 |
+
if line_stripped:
|
| 112 |
+
freq[line] = freq.get(line, 0) + 1
|
| 113 |
+
lines = [line for line in lines if freq.get(line, 0) <= 1]
|
| 114 |
+
text = "\n".join(lines)
|
| 115 |
+
|
| 116 |
+
if lowercase:
|
| 117 |
+
text = text.lower()
|
| 118 |
+
|
| 119 |
+
if remove_special:
|
| 120 |
+
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
|
| 121 |
+
|
| 122 |
+
return text
|
| 123 |
+
|
| 124 |
+
def execute_cleaning(text: str, remove_spaces: bool, remove_headers: bool, lowercase: bool, remove_special: bool) -> str:
|
| 125 |
+
if not text or text.strip() == "":
|
| 126 |
+
return "No text available for cleaning."
|
| 127 |
+
cleaned = clean_text(text, remove_spaces, remove_headers, lowercase, remove_special)
|
| 128 |
+
return cleaned
|
| 129 |
+
|
| 130 |
# ------------------------------
|
| 131 |
# Global variables for background jobs
|
| 132 |
# ------------------------------
|
|
|
|
| 144 |
if not jobs:
|
| 145 |
return "No jobs found. Submit a query or load files to create jobs."
|
| 146 |
|
|
|
|
| 147 |
sorted_jobs = sorted(
|
| 148 |
[(job_id, job_info) for job_id, job_info in jobs.items()],
|
| 149 |
key=lambda x: x[1].get("start_time", 0),
|
|
|
|
| 156 |
query = job_info.get("query", "")
|
| 157 |
start_time = job_info.get("start_time", 0)
|
| 158 |
time_str = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
|
|
|
| 159 |
query_preview = query[:30] + "..." if query and len(query) > 30 else query or "N/A"
|
| 160 |
|
|
|
|
| 161 |
if status == "processing":
|
| 162 |
status_formatted = f"<span style='color: red'>⏳ {status}</span>"
|
| 163 |
elif status == "completed":
|
|
|
|
| 174 |
|
| 175 |
def get_sheet_data():
|
| 176 |
global sheet_data
|
| 177 |
+
return sheet_data if sheet_data else "No data loaded."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
def process_in_background(job_id, func, args):
|
|
|
|
| 180 |
result = func(*args)
|
| 181 |
results_queue.put((job_id, result))
|
| 182 |
debug_print(f"Job {job_id} finished processing in background.")
|
| 183 |
|
|
|
|
| 184 |
def submit_query_async(query, model_choice=None):
|
|
|
|
| 185 |
global last_job_id
|
| 186 |
global sheet_data
|
| 187 |
|
|
|
|
| 191 |
job_id = str(uuid.uuid4())
|
| 192 |
debug_print(f"Starting async job {job_id} for query: {query}")
|
| 193 |
|
|
|
|
|
|
|
| 194 |
threading.Thread(
|
| 195 |
target=process_in_background,
|
| 196 |
args=(job_id, process_query, [query, model_choice or "Mistral-API"])
|
|
|
|
| 212 |
f"Job ID: {job_id}",
|
| 213 |
f"Input tokens: {count_tokens(query)}",
|
| 214 |
"Output tokens: pending",
|
| 215 |
+
job_id,
|
| 216 |
+
query,
|
| 217 |
+
get_job_list()
|
| 218 |
)
|
| 219 |
|
| 220 |
def job_selected(job_id):
|
|
|
|
| 233 |
html_response = "<div style='font-family: monospace;'><p>Please enter a job ID.</p></div>"
|
| 234 |
return html_response, "", "", "", ""
|
| 235 |
|
|
|
|
| 236 |
try:
|
| 237 |
while not results_queue.empty():
|
| 238 |
completed_id, result = results_queue.get_nowait()
|
|
|
|
| 291 |
to_delete = []
|
| 292 |
|
| 293 |
for job_id, job in jobs.items():
|
|
|
|
| 294 |
if job["status"] == "completed" and (current_time - job.get("end_time", 0)) > 86400:
|
| 295 |
to_delete.append(job_id)
|
| 296 |
elif job["status"] == "processing" and (current_time - job.get("start_time", 0)) > 172800:
|
|
|
|
| 304 |
|
| 305 |
# Function to run query (dummy function)
|
| 306 |
def run_query(max_value):
|
|
|
|
| 307 |
return [[i, i**2] for i in range(1, max_value + 1)]
|
| 308 |
|
|
|
|
| 309 |
def periodic_update(is_checked):
|
| 310 |
interval = 3 if is_checked else None
|
| 311 |
debug_print(f"Auto-refresh checkbox is {'checked' if is_checked else 'unchecked'}, every={interval}")
|
|
|
|
| 313 |
global last_job_id
|
| 314 |
job_list_md = refresh_job_list()
|
| 315 |
job_status = check_job_status(last_job_id) if last_job_id else ("No job ID available", "", "", "", "")
|
|
|
|
|
|
|
| 316 |
from bs4 import BeautifulSoup
|
| 317 |
html_content = job_status[0]
|
| 318 |
plain_text = ""
|
| 319 |
if html_content:
|
| 320 |
soup = BeautifulSoup(html_content, "html.parser")
|
| 321 |
plain_text = soup.get_text()
|
|
|
|
|
|
|
| 322 |
return job_list_md, job_status[0], plain_text, job_status[1], job_status[2], job_status[3], job_status[4]
|
| 323 |
else:
|
|
|
|
| 324 |
return "", "", "", "", "", "", ""
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
# ------------------------------
|
| 327 |
# Gradio UI Layout: Scouting AI App
|
| 328 |
# ------------------------------
|
| 329 |
|
| 330 |
with gr.Blocks() as app:
|
| 331 |
# App Title and Description
|
| 332 |
+
gr.Markdown("## π PDF Conversion")
|
| 333 |
+
gr.Markdown("Text cleaning and processing tools.")
|
| 334 |
|
| 335 |
+
# Top section: File Load and Job Information (two columns)
|
| 336 |
with gr.Row():
|
| 337 |
# Left Column: File Load Section (50% width)
|
| 338 |
with gr.Column(scale=1):
|
| 339 |
gr.Markdown("### π Load File Section")
|
| 340 |
+
gr.Markdown("Upload your **.pdf** file below and specify the page range to extract text.")
|
| 341 |
file_input = gr.File(label="Upload .pdf File")
|
| 342 |
page_start_input_file = gr.Textbox(label="Page Start")
|
| 343 |
page_end_input_file = gr.Textbox(label="Page End")
|
| 344 |
load_button_file = gr.Button("Load File")
|
| 345 |
+
sheet_output_file = gr.Textbox(label="Extracted Text", interactive=False)
|
| 346 |
|
| 347 |
# Right Column: Job Information Section (50% width)
|
| 348 |
with gr.Column(scale=1):
|
| 349 |
gr.Markdown("### π Job Information")
|
| 350 |
gr.Markdown("View all submitted jobs, refresh the list, and check the status of individual jobs.")
|
|
|
|
|
|
|
| 351 |
job_list_display = gr.Markdown(
|
| 352 |
get_job_list(),
|
| 353 |
elem_id="job-list-display",
|
| 354 |
elem_classes=["scrollable-job-list"]
|
| 355 |
)
|
|
|
|
|
|
|
| 356 |
gr.HTML("""
|
| 357 |
<style>
|
| 358 |
.scrollable-job-list {
|
|
|
|
| 364 |
}
|
| 365 |
</style>
|
| 366 |
""")
|
|
|
|
| 367 |
refresh_button = gr.Button("Refresh Job List")
|
|
|
|
| 368 |
gr.Markdown("#### π Check Job Status")
|
| 369 |
job_id_input = gr.Textbox(label="Enter Job ID")
|
| 370 |
check_status_button = gr.Button("Check Job Status")
|
| 371 |
+
|
| 372 |
+
# New row: Cleaning Tasks placed in two equal columns under the load section
|
| 373 |
with gr.Row():
|
| 374 |
+
# Left half: Cleaning Tasks checkboxes and Clean button
|
| 375 |
with gr.Column(scale=1):
|
| 376 |
+
gr.Markdown("### Cleaning Options")
|
| 377 |
+
remove_spaces_checkbox = gr.Checkbox(label="Remove extra spaces & newlines: Clean unnecessary whitespace.", value=True)
|
| 378 |
+
remove_headers_checkbox = gr.Checkbox(label="Remove headers/footers: If repeated text appears on every page", value=False)
|
| 379 |
+
lowercase_checkbox = gr.Checkbox(label="Convert text to lowercase: For uniformity in text analysis.", value=False)
|
| 380 |
+
remove_special_checkbox = gr.Checkbox(label="Remove special characters: Useful for structured data extraction", value=False)
|
| 381 |
+
clean_button = gr.Button("Clean")
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
# Right half: Display Cleaned Text
|
| 385 |
+
with gr.Column(scale=1):
|
| 386 |
+
cleaned_output = gr.Textbox(label="Cleaned Text", interactive=False)
|
| 387 |
+
|
| 388 |
+
# Submit Query Section remains unchanged
|
| 389 |
with gr.Row():
|
|
|
|
| 390 |
with gr.Column(scale=1):
|
| 391 |
gr.Markdown("### π Submit Query")
|
| 392 |
gr.Markdown("Enter your prompt below and choose a model. Your query will be processed in the background.")
|
| 393 |
model_dropdown = gr.Dropdown(
|
| 394 |
choices=["🇺🇸 Remote Meta-Llama-3", "🇪🇺 Mistral-API"],
|
| 395 |
+
value="🇪🇺 Mistral-API",
|
| 396 |
label="Select Model"
|
| 397 |
)
|
| 398 |
+
prompt_input = gr.Textbox(label="Enter your prompt", value="", lines=6)
|
| 399 |
with gr.Row():
|
| 400 |
+
auto_refresh_checkbox_query = gr.Checkbox(
|
| 401 |
label="Enable Auto Refresh",
|
| 402 |
+
value=False
|
| 403 |
)
|
| 404 |
+
submit_query_button = gr.Button("Submit Query")
|
| 405 |
+
status_text = gr.Textbox(label="Response Text", visible=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
response_output = gr.Textbox(label="Response", interactive=False)
|
| 407 |
token_info = gr.Textbox(label="Token Info", interactive=False)
|
|
|
|
|
|
|
| 408 |
with gr.Column(scale=1):
|
|
|
|
| 409 |
status_output = gr.HTML(label="Job Status", interactive=False)
|
|
|
|
| 410 |
job_id_display = gr.Textbox(label="Job ID", interactive=False)
|
| 411 |
input_tokens_display = gr.Textbox(label="Input Tokens", interactive=False)
|
| 412 |
output_tokens_display = gr.Textbox(label="Output Tokens", interactive=False)
|
|
|
|
| 416 |
# Set up interactions
|
| 417 |
# ------------------------------
|
| 418 |
|
| 419 |
+
# Updated Load file interaction: read PDF pages
|
| 420 |
+
def load_file(file, page_start, page_end):
|
| 421 |
+
global sheet_data, file_name
|
|
|
|
|
|
|
| 422 |
file_name = file
|
| 423 |
+
if file is None or str(page_start).strip() == "" or str(page_end).strip() == "":
|
| 424 |
+
return "Please upload a file and enter valid page numbers."
|
|
|
|
|
|
|
|
|
|
| 425 |
try:
|
| 426 |
+
doc = fitz.open(file.name)
|
| 427 |
+
ps = int(page_start)
|
| 428 |
+
pe = int(page_end)
|
| 429 |
+
text = ""
|
| 430 |
+
# Convert page numbers from 1-indexed to 0-indexed
|
| 431 |
+
for page_num in range(ps - 1, pe):
|
| 432 |
+
text += doc[page_num].get_text() + "\n"
|
| 433 |
+
sheet_data = text
|
| 434 |
+
return text
|
| 435 |
except Exception as e:
|
| 436 |
+
return f"Error reading PDF: {str(e)}"
|
| 437 |
|
| 438 |
load_button_file.click(
|
| 439 |
fn=load_file,
|
| 440 |
+
inputs=[file_input, page_start_input_file, page_end_input_file],
|
| 441 |
outputs=sheet_output_file
|
| 442 |
)
|
| 443 |
|
| 444 |
+
# Cleaning button interaction: clean the loaded text using selected options.
|
| 445 |
+
clean_button.click(
|
| 446 |
+
fn=execute_cleaning,
|
| 447 |
+
inputs=[sheet_output_file, remove_spaces_checkbox, remove_headers_checkbox, lowercase_checkbox, remove_special_checkbox],
|
| 448 |
+
outputs=cleaned_output
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
submit_query_button.click(
|
| 452 |
fn=submit_query_async,
|
| 453 |
inputs=[prompt_input, model_dropdown],
|
| 454 |
outputs=[
|
|
|
|
| 458 |
]
|
| 459 |
)
|
| 460 |
|
|
|
|
| 461 |
check_status_button.click(
|
| 462 |
fn=check_job_status,
|
| 463 |
inputs=[job_id_input],
|
|
|
|
| 465 |
output_tokens_display, job_query_display]
|
| 466 |
)
|
| 467 |
|
|
|
|
| 468 |
refresh_button.click(
|
| 469 |
fn=refresh_job_list,
|
| 470 |
inputs=[],
|
| 471 |
outputs=job_list_display
|
| 472 |
)
|
| 473 |
|
| 474 |
+
auto_refresh_checkbox_query.change(
|
|
|
|
| 475 |
fn=periodic_update,
|
| 476 |
+
inputs=[auto_refresh_checkbox_query],
|
| 477 |
outputs=[job_list_display, status_output, status_text, job_id_display, input_tokens_display, output_tokens_display, job_query_display],
|
| 478 |
every=3
|
| 479 |
)
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
if __name__ == "__main__":
|
| 482 |
debug_print("Launching Gradio UI...")
|
| 483 |
app.queue().launch(share=False)
|
requirements.txt
CHANGED
|
@@ -41,3 +41,5 @@ pydantic==2.9.0
|
|
| 41 |
sentence-transformers>=2.4.0
|
| 42 |
|
| 43 |
mistralai==1.5.0
|
|
|
|
|
|
|
|
|
| 41 |
sentence-transformers>=2.4.0
|
| 42 |
|
| 43 |
mistralai==1.5.0
|
| 44 |
+
|
| 45 |
+
PyMuPDF
|