Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,12 +11,11 @@ import subprocess
|
|
| 11 |
import re
|
| 12 |
|
| 13 |
# Initialize global state
|
| 14 |
-
contract_data = {} # In-memory repository
|
| 15 |
-
failed_records = []
|
| 16 |
processed_files = 0
|
| 17 |
total_files = 0
|
| 18 |
|
| 19 |
-
# Load pre-trained LayoutLMv3 model and tokenizer
|
| 20 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 21 |
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
|
| 22 |
|
|
@@ -50,24 +49,22 @@ def extract_text_from_pdf(pdf_bytes):
|
|
| 50 |
text = ""
|
| 51 |
for img in images:
|
| 52 |
text += pytesseract.image_to_string(img) + "\n"
|
| 53 |
-
print(f"OCR completed - Extracted text length: {len(text)}")
|
| 54 |
return text
|
| 55 |
except Exception as e:
|
| 56 |
-
print(f"OCR failed: {str(e)}")
|
| 57 |
return f"Error extracting text: {str(e)}"
|
| 58 |
finally:
|
| 59 |
if os.path.exists(temp_path):
|
| 60 |
os.unlink(temp_path)
|
| 61 |
|
| 62 |
def extract_key_data(text):
|
| 63 |
-
"""Extract key data (dates, amounts, clauses) using
|
| 64 |
dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)
|
| 65 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text)
|
| 66 |
clauses = re.findall(r'(?:Section|Clause)\s+\d+\.\d+\s+(.+?)(?=\n|$)', text, re.DOTALL)
|
| 67 |
return {"dates": dates, "amounts": amounts, "clauses": clauses}
|
| 68 |
|
| 69 |
def detect_risks(data):
|
| 70 |
-
"""
|
| 71 |
risks = []
|
| 72 |
if not data["dates"]:
|
| 73 |
risks.append("No expiration date detected - potential obligation risk.")
|
|
@@ -81,19 +78,19 @@ def process_contract(pdf_bytes, object_type):
|
|
| 81 |
total_files = 1
|
| 82 |
processed_files = 0
|
| 83 |
|
| 84 |
-
print(
|
| 85 |
text = extract_text_from_pdf(pdf_bytes)
|
| 86 |
if isinstance(text, str) and text.startswith("Error"):
|
| 87 |
return text, {}, [], "0/1"
|
| 88 |
|
| 89 |
-
print(
|
| 90 |
key_data = extract_key_data(text)
|
| 91 |
-
print(
|
| 92 |
risks = detect_risks(key_data)
|
| 93 |
status = "✅ Processed" if not risks else "⚠️ Processed with risks"
|
| 94 |
|
| 95 |
# Mock CLM integration with predefined fields
|
| 96 |
-
clm_fields = {"Name": "
|
| 97 |
clm_fields.update(key_data)
|
| 98 |
|
| 99 |
contract_id = f"Contract_{len(contract_data) + 1}"
|
|
@@ -117,21 +114,29 @@ def search_contracts(query):
|
|
| 117 |
# Gradio UI
|
| 118 |
with gr.Blocks(title="Contract Intelligence App") as demo:
|
| 119 |
with gr.Row():
|
| 120 |
-
file_input = gr.File(type="binary", file_types=["pdf"], file_count="
|
| 121 |
upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)
|
| 122 |
|
| 123 |
object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")
|
| 124 |
|
| 125 |
-
process_button = gr.Button("Process
|
| 126 |
status_output = gr.Textbox(label="Status", interactive=False)
|
| 127 |
extracted_data_output = gr.JSON(label="Extracted Data")
|
| 128 |
risks_output = gr.Textbox(label="Detected Risks", interactive=False)
|
| 129 |
|
| 130 |
-
def process_and_display(
|
| 131 |
-
if
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
process_button.click(
|
| 137 |
fn=process_and_display,
|
|
@@ -144,11 +149,8 @@ with gr.Blocks(title="Contract Intelligence App") as demo:
|
|
| 144 |
search_results = gr.JSON(label="Search Results")
|
| 145 |
search_button = gr.Button("Search")
|
| 146 |
|
| 147 |
-
def search_and_display(query):
|
| 148 |
-
return search_contracts(query)
|
| 149 |
-
|
| 150 |
search_button.click(
|
| 151 |
-
fn=
|
| 152 |
inputs=search_query,
|
| 153 |
outputs=search_results
|
| 154 |
)
|
|
|
|
| 11 |
import re
|
| 12 |
|
| 13 |
# Initialize global state
|
| 14 |
+
contract_data = {} # In-memory contract repository
|
|
|
|
| 15 |
processed_files = 0
|
| 16 |
total_files = 0
|
| 17 |
|
| 18 |
+
# Load pre-trained LayoutLMv3 model and tokenizer (placeholder for future fine-tuning)
|
| 19 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 20 |
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
|
| 21 |
|
|
|
|
| 49 |
text = ""
|
| 50 |
for img in images:
|
| 51 |
text += pytesseract.image_to_string(img) + "\n"
|
|
|
|
| 52 |
return text
|
| 53 |
except Exception as e:
|
|
|
|
| 54 |
return f"Error extracting text: {str(e)}"
|
| 55 |
finally:
|
| 56 |
if os.path.exists(temp_path):
|
| 57 |
os.unlink(temp_path)
|
| 58 |
|
| 59 |
def extract_key_data(text):
    """Extract key data (dates, amounts, clauses) using regex as a mock AI.

    Args:
        text: Plain text of a contract (e.g. OCR output).

    Returns:
        A dict with three lists of strings, in order of appearance:
        ``"dates"`` (``D/M/YYYY``-shaped tokens), ``"amounts"`` (dollar
        amounts, with or without thousands separators) and ``"clauses"``
        (the text following a ``Section N.N`` / ``Clause N.N`` heading, up
        to the end of that line).
    """
    dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)

    # Accept either properly comma-grouped digits or a plain digit run.
    # The grouped form alone would cut "$1500.00" down to "$150".
    amounts = re.findall(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?', text)

    # The capture must start on a non-whitespace character and stay on one
    # line, so a heading followed only by whitespace yields no clause
    # instead of a blank / "\n" entry.
    clauses = re.findall(r'(?:Section|Clause)\s+\d+\.\d+\s+(\S[^\n]*)', text)

    return {"dates": dates, "amounts": amounts, "clauses": clauses}
|
| 65 |
|
| 66 |
def detect_risks(data):
|
| 67 |
+
"""Detect risks (e.g., missing dates, large amounts)."""
|
| 68 |
risks = []
|
| 69 |
if not data["dates"]:
|
| 70 |
risks.append("No expiration date detected - potential obligation risk.")
|
|
|
|
| 78 |
total_files = 1
|
| 79 |
processed_files = 0
|
| 80 |
|
| 81 |
+
print("Received file - Starting processing")
|
| 82 |
text = extract_text_from_pdf(pdf_bytes)
|
| 83 |
if isinstance(text, str) and text.startswith("Error"):
|
| 84 |
return text, {}, [], "0/1"
|
| 85 |
|
| 86 |
+
print("Extracting key data")
|
| 87 |
key_data = extract_key_data(text)
|
| 88 |
+
print("Detecting risks")
|
| 89 |
risks = detect_risks(key_data)
|
| 90 |
status = "✅ Processed" if not risks else "⚠️ Processed with risks"
|
| 91 |
|
| 92 |
# Mock CLM integration with predefined fields
|
| 93 |
+
clm_fields = {"Name": f"Contract_{len(contract_data) + 1}", "Type": object_type, "Status": status}
|
| 94 |
clm_fields.update(key_data)
|
| 95 |
|
| 96 |
contract_id = f"Contract_{len(contract_data) + 1}"
|
|
|
|
| 114 |
# Gradio UI
|
| 115 |
with gr.Blocks(title="Contract Intelligence App") as demo:
|
| 116 |
with gr.Row():
|
| 117 |
+
file_input = gr.File(type="binary", file_types=["pdf"], file_count="multiple", label="Upload Contracts")
|
| 118 |
upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)
|
| 119 |
|
| 120 |
object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")
|
| 121 |
|
| 122 |
+
process_button = gr.Button("Process Contracts")
|
| 123 |
status_output = gr.Textbox(label="Status", interactive=False)
|
| 124 |
extracted_data_output = gr.JSON(label="Extracted Data")
|
| 125 |
risks_output = gr.Textbox(label="Detected Risks", interactive=False)
|
| 126 |
|
| 127 |
+
def process_and_display(files, obj_type):
    """Process every uploaded file and aggregate the results for the UI.

    Args:
        files: Uploaded items from the file input. The input is declared
            with ``type="binary"``, so each item is expected to be the raw
            bytes of one PDF; a falsy value means nothing was uploaded.
        obj_type: Object type selected in the dropdown, forwarded to
            ``process_contract``.

    Returns:
        A 4-tuple ``(status_text, extracted_data, risks_text, progress)``:
        one status line per file, a dict keyed ``File_<n>`` holding each
        file's extracted data, the newline-joined risks (or
        ``"No risks detected"``), and a ``gr.update`` carrying the
        ``done/total`` progress string.
    """
    if not files:
        return "❌ No files uploaded.", {}, "No risks detected", gr.update(value="0/0")

    results = []
    all_data = {}
    all_risks = []
    for file in files:
        status, data, risks, _ = process_contract(file, obj_type)

        # Same key that stores this file's data, so each status line can be
        # matched to its entry in the extracted-data output.
        data_key = f"File_{len(all_data)}"
        if isinstance(file, (bytes, bytearray)):
            # Binary uploads are the PDF contents, not a path: decoding
            # them raises UnicodeDecodeError (or yields garbage), so label
            # the file by its data key instead.
            label = data_key
        else:
            # NOTE(review): only reached if the input is switched to a
            # path-based type; file-like wrappers are assumed to expose
            # the path as `.name` - confirm before relying on this branch.
            label = os.path.basename(getattr(file, "name", file))

        results.append(f"{status} - File: {label}")
        all_data[data_key] = data
        all_risks.extend(risks)

    progress = f"{len(files)}/{len(files)}"
    risks_text = "\n".join(all_risks) if all_risks else "No risks detected"
    return "\n".join(results), all_data, risks_text, gr.update(value=progress)
|
| 140 |
|
| 141 |
process_button.click(
|
| 142 |
fn=process_and_display,
|
|
|
|
| 149 |
search_results = gr.JSON(label="Search Results")
|
| 150 |
search_button = gr.Button("Search")
|
| 151 |
|
|
|
|
|
|
|
|
|
|
| 152 |
search_button.click(
|
| 153 |
+
fn=search_contracts,
|
| 154 |
inputs=search_query,
|
| 155 |
outputs=search_results
|
| 156 |
)
|