Spaces:

Basementup
/

Litigationstation

Sleeping

App Files Community

Basementup committed on Apr 16

Commit

ff5486a

verified ·

1 Parent(s): efc2dd4

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -27

app.py CHANGED Viewed

@@ -8,8 +8,9 @@ from datetime import datetime
 from huggingface_hub import HfApi
 from pypdf import PdfReader
 import docx
-DATA_FILE = "/home/ubuntu/legislation_rules.json"
 def load_data():
     if os.path.exists(DATA_FILE):
@@ -32,7 +33,7 @@ def add_rule_manually(act, title, text, source):
     det_id = get_canonical_hash(text)
     if any(r['deterministic_id'] == det_id for r in data):
-        return "Error: This rule already exists in the dataset (matching hash)."
     new_rule = {
         "act": act,
@@ -46,12 +47,18 @@ def add_rule_manually(act, title, text, source):
     save_data(data)
     return f"Successfully added: {title} from {act}"
-def process_document(file_path, act_name):
     """
-    Extracts text from PDF or DOCX and adds it as a rule.
     """
-    if not file_path:
-        return "No file provided."
     ext = os.path.splitext(file_path)[1].lower()
     text = ""
@@ -61,31 +68,35 @@ def process_document(file_path, act_name):
         if ext == ".pdf":
             reader = PdfReader(file_path)
             for page in reader.pages:
-                text += page.extract_text() + "\n"
         elif ext == ".docx":
             doc = docx.Document(file_path)
             text = "\n".join([para.text for para in doc.paragraphs])
         elif ext == ".txt":
-            with open(file_path, 'r') as f:
                 text = f.read()
         else:
-            return f"Unsupported file type: {ext}"
         if not text.strip():
-            return "Error: Could not extract text from document."
         return add_rule_manually(act_name, title, text, f"Uploaded File: {title}")
     except Exception as e:
-        return f"Document processing failed: {str(e)}"
 def scrape_fca_prin():
     url = "https://handbook.fca.org.uk/handbook/PRIN/2/1.html"
     try:
-        response = requests.get(url)
         soup = BeautifulSoup(response.content, 'html.parser')
         principles_table = soup.find('table')
         if not principles_table:
-            return "Error: Could not find the Principles table on the FCA site."
         rows = principles_table.find_all('tr')
         added_count = 0
         for row in rows:
@@ -135,23 +146,23 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Tab("➕ Add Rule"):
         with gr.Row():
             with gr.Column():
-                gr.Markdown("#### Manual Entry")
-                act_input = gr.Textbox(label="Act/Legislation Name")
-                title_input = gr.Textbox(label="Section/Rule Title")
-                source_input = gr.Textbox(label="Source URL")
-                text_input = gr.TextArea(label="Rule Text")
-                add_btn = gr.Button("Add Manually", variant="primary")
                 gr.Markdown("---")
-                gr.Markdown("#### Document Ingestion")
-                doc_act_name = gr.Textbox(label="Legislation Name for Document", placeholder="e.g., Internal Policy 2024")
-                doc_input = gr.File(label="Upload PDF, DOCX, or TXT")
-                doc_btn = gr.Button("Process Document", variant="secondary")
             with gr.Column():
-                status_out = gr.Textbox(label="Status")
-                stats_view = gr.Textbox(label="Dataset Stats", value=view_dataset_stats())
-                refresh_btn = gr.Button("Refresh Stats")
     with gr.Tab("🏦 FCA Guidelines"):
         gr.Markdown("### 🛠️ FCA Handbook Automation")
@@ -161,7 +172,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Tab("☁️ Sync to Hugging Face"):
         hf_token = gr.Textbox(label="HF Write Token", type="password")
         hf_id = gr.Textbox(label="Dataset ID")
-        sync_btn = gr.Button("Sync Now")
         sync_status = gr.Textbox(label="Sync Status")
     add_btn.click(

 from huggingface_hub import HfApi
 from pypdf import PdfReader
 import docx
+import traceback
+DATA_FILE = "legislation_rules.json" # Relative path for HF environment
 def load_data():
     if os.path.exists(DATA_FILE):
     det_id = get_canonical_hash(text)
     if any(r['deterministic_id'] == det_id for r in data):
+        return f"Warning: Rule '{title}' already exists (matching hash)."
     new_rule = {
         "act": act,
     save_data(data)
     return f"Successfully added: {title} from {act}"
+def process_document(file_obj, act_name):
     """
+    Robustly extracts text from PDF, DOCX, or TXT files.
     """
+    if file_obj is None:
+        return "Error: No file was uploaded."
+    # Handle Gradio's file object (it can be a string path or a file-like object)
+    file_path = file_obj.name if hasattr(file_obj, 'name') else file_obj
+    if not os.path.exists(file_path):
+        return f"Error: File not found at {file_path}. Please try uploading again."
     ext = os.path.splitext(file_path)[1].lower()
     text = ""
         if ext == ".pdf":
             reader = PdfReader(file_path)
             for page in reader.pages:
+                extracted = page.extract_text()
+                if extracted:
+                    text += extracted + "\n"
         elif ext == ".docx":
             doc = docx.Document(file_path)
             text = "\n".join([para.text for para in doc.paragraphs])
         elif ext == ".txt":
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                 text = f.read()
         else:
+            return f"Error: Unsupported file type '{ext}'. Use PDF, DOCX, or TXT."
         if not text.strip():
+            return "Error: Could not extract any text. The file might be empty or an image-based PDF."
         return add_rule_manually(act_name, title, text, f"Uploaded File: {title}")
     except Exception as e:
+        error_msg = traceback.format_exc()
+        print(error_msg) # Log to HF console
+        return f"Processing Failed: {str(e)}"
 def scrape_fca_prin():
     url = "https://handbook.fca.org.uk/handbook/PRIN/2/1.html"
     try:
+        response = requests.get(url, timeout=10)
         soup = BeautifulSoup(response.content, 'html.parser')
         principles_table = soup.find('table')
         if not principles_table:
+            return "Error: Could not find the Principles table. The FCA site structure may have changed."
         rows = principles_table.find_all('tr')
         added_count = 0
         for row in rows:
     with gr.Tab("➕ Add Rule"):
         with gr.Row():
             with gr.Column():
+                gr.Markdown("#### 📂 Option A: Document Ingestion")
+                doc_act_name = gr.Textbox(label="Legislation/Source Name", placeholder="e.g., Consumer Rights Act 2015")
+                doc_input = gr.File(label="Upload PDF, DOCX, or TXT")
+                doc_btn = gr.Button("Process Document", variant="primary")
                 gr.Markdown("---")
+                gr.Markdown("#### ✍️ Option B: Manual Entry")
+                act_input = gr.Textbox(label="Act Name")
+                title_input = gr.Textbox(label="Section Title")
+                source_input = gr.Textbox(label="Source URL")
+                text_input = gr.TextArea(label="Rule Text")
+                add_btn = gr.Button("Add Manually")
             with gr.Column():
+                status_out = gr.Textbox(label="Operation Status")
+                stats_view = gr.Textbox(label="Dataset Inventory", value=view_dataset_stats())
+                refresh_btn = gr.Button("Refresh Inventory")
     with gr.Tab("🏦 FCA Guidelines"):
         gr.Markdown("### 🛠️ FCA Handbook Automation")
     with gr.Tab("☁️ Sync to Hugging Face"):
         hf_token = gr.Textbox(label="HF Write Token", type="password")
         hf_id = gr.Textbox(label="Dataset ID")
+        sync_btn = gr.Button("Sync to Dataset")
         sync_status = gr.Textbox(label="Sync Status")
     add_btn.click(