Spaces:
Sleeping
Sleeping
Update app.py
Browse files — added batch processing
app.py
CHANGED
|
@@ -5,6 +5,8 @@ import re
|
|
| 5 |
from tokenizers import normalizers
|
| 6 |
from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
|
| 7 |
from tokenizers import Regex
|
|
|
|
|
|
|
| 8 |
|
| 9 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 10 |
|
|
@@ -58,6 +60,23 @@ tokenizer.backend_tokenizer.normalizer = Sequence([
|
|
| 58 |
Strip()
|
| 59 |
])
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def classify_text(text):
|
| 62 |
cleaned_text = clean_text(text)
|
| 63 |
if not text.strip():
|
|
@@ -100,7 +119,16 @@ def classify_text(text):
|
|
| 100 |
|
| 101 |
return result_message
|
| 102 |
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
|
|
@@ -232,12 +260,19 @@ with iface:
|
|
| 232 |
gr.Markdown(f"# {title}")
|
| 233 |
gr.Markdown(description)
|
| 234 |
text_input = gr.Textbox(label="", placeholder="Type or paste your content here...", elem_id="text_input_box", lines=5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
result_output = gr.Markdown("", elem_id="result_output_box")
|
| 236 |
text_input.change(classify_text, inputs=text_input, outputs=result_output)
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
with gr.Tab("Human text examples"):
|
| 240 |
-
gr.Examples(Human_texts, inputs=text_input)
|
| 241 |
gr.Markdown(bottom_text, elem_id="bottom_text")
|
| 242 |
|
| 243 |
iface.launch(share=True)
|
|
|
|
| 5 |
from tokenizers import normalizers
|
| 6 |
from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
|
| 7 |
from tokenizers import Regex
|
| 8 |
+
import fitz # PyMuPDF
|
| 9 |
+
import os
|
| 10 |
|
| 11 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 12 |
|
|
|
|
| 60 |
Strip()
|
| 61 |
])
|
| 62 |
|
| 63 |
+
def extract_text_from_file(file):
    """Extract raw text from an uploaded file.

    Args:
        file: Uploaded file object exposing a ``.name`` attribute that
            holds its on-disk path (e.g. a Gradio file wrapper).

    Returns:
        str: The file's textual content. PDFs are parsed page by page
        with PyMuPDF; every other upload is read as UTF-8 text with
        undecodable bytes ignored.
    """
    file_path = file.name
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        # Context manager guarantees the PyMuPDF document handle is
        # closed even if a page fails to parse (the old explicit
        # doc.close() was skipped on exceptions).
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    # Bug fix: the UI's file picker also accepts .py and .ipynb, but the
    # previous code only handled ".txt" and silently returned "" for
    # everything else. Treat every non-PDF upload as plain text.
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()
|
| 79 |
+
|
| 80 |
def classify_text(text):
|
| 81 |
cleaned_text = clean_text(text)
|
| 82 |
if not text.strip():
|
|
|
|
| 119 |
|
| 120 |
return result_message
|
| 121 |
|
| 122 |
+
def batch_classify(files):
    """Run the classifier over a batch of uploaded files.

    Args:
        files: Iterable of uploaded file objects, each exposing a
            ``.name`` attribute with its on-disk path.

    Returns:
        list: ``[file_name, result_html]`` rows for a results table;
        a file that fails to process yields an ``Error: ...`` message
        in place of its classification.
    """
    rows = []
    for upload in files:
        label = os.path.basename(upload.name)
        try:
            extracted = extract_text_from_file(upload)
            verdict = classify_text(extracted)
        except Exception as err:
            verdict = f"Error: {str(err)}"
        rows.append([label, verdict])
    return rows
|
| 132 |
|
| 133 |
|
| 134 |
|
|
|
|
| 260 |
gr.Markdown(f"# {title}")
|
| 261 |
gr.Markdown(description)
|
| 262 |
text_input = gr.Textbox(label="", placeholder="Type or paste your content here...", elem_id="text_input_box", lines=5)
|
| 263 |
+
|
| 264 |
+
gr.Markdown("## Or upload multiple files for batch classification")
|
| 265 |
+
file_input = gr.File(label="Upload PDF or Text Files", file_types=[".pdf", ".txt", ".py", ".ipynb"], file_count="multiple")
|
| 266 |
+
result_table = gr.Dataframe(headers=["File Name", "Classification Result"], wrap=True)
|
| 267 |
+
|
| 268 |
+
file_input.change(fn=batch_classify, inputs=file_input, outputs=result_table)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
|
| 272 |
result_output = gr.Markdown("", elem_id="result_output_box")
|
| 273 |
text_input.change(classify_text, inputs=text_input, outputs=result_output)
|
| 274 |
+
|
| 275 |
+
|
|
|
|
|
|
|
| 276 |
gr.Markdown(bottom_text, elem_id="bottom_text")
|
| 277 |
|
| 278 |
iface.launch(share=True)
|