Basementup committed on
Commit
ff5486a
·
verified ·
1 Parent(s): efc2dd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -27
app.py CHANGED
@@ -8,8 +8,9 @@ from datetime import datetime
8
  from huggingface_hub import HfApi
9
  from pypdf import PdfReader
10
  import docx
 
11
 
12
- DATA_FILE = "/home/ubuntu/legislation_rules.json"
13
 
14
  def load_data():
15
  if os.path.exists(DATA_FILE):
@@ -32,7 +33,7 @@ def add_rule_manually(act, title, text, source):
32
  det_id = get_canonical_hash(text)
33
 
34
  if any(r['deterministic_id'] == det_id for r in data):
35
- return "Error: This rule already exists in the dataset (matching hash)."
36
 
37
  new_rule = {
38
  "act": act,
@@ -46,12 +47,18 @@ def add_rule_manually(act, title, text, source):
46
  save_data(data)
47
  return f"Successfully added: {title} from {act}"
48
 
49
- def process_document(file_path, act_name):
50
  """
51
- Extracts text from PDF or DOCX and adds it as a rule.
52
  """
53
- if not file_path:
54
- return "No file provided."
 
 
 
 
 
 
55
 
56
  ext = os.path.splitext(file_path)[1].lower()
57
  text = ""
@@ -61,31 +68,35 @@ def process_document(file_path, act_name):
61
  if ext == ".pdf":
62
  reader = PdfReader(file_path)
63
  for page in reader.pages:
64
- text += page.extract_text() + "\n"
 
 
65
  elif ext == ".docx":
66
  doc = docx.Document(file_path)
67
  text = "\n".join([para.text for para in doc.paragraphs])
68
  elif ext == ".txt":
69
- with open(file_path, 'r') as f:
70
  text = f.read()
71
  else:
72
- return f"Unsupported file type: {ext}"
73
 
74
  if not text.strip():
75
- return "Error: Could not extract text from document."
76
 
77
  return add_rule_manually(act_name, title, text, f"Uploaded File: {title}")
78
  except Exception as e:
79
- return f"Document processing failed: {str(e)}"
 
 
80
 
81
  def scrape_fca_prin():
82
  url = "https://handbook.fca.org.uk/handbook/PRIN/2/1.html"
83
  try:
84
- response = requests.get(url)
85
  soup = BeautifulSoup(response.content, 'html.parser')
86
  principles_table = soup.find('table')
87
  if not principles_table:
88
- return "Error: Could not find the Principles table on the FCA site."
89
  rows = principles_table.find_all('tr')
90
  added_count = 0
91
  for row in rows:
@@ -135,23 +146,23 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
135
  with gr.Tab("➕ Add Rule"):
136
  with gr.Row():
137
  with gr.Column():
138
- gr.Markdown("#### Manual Entry")
139
- act_input = gr.Textbox(label="Act/Legislation Name")
140
- title_input = gr.Textbox(label="Section/Rule Title")
141
- source_input = gr.Textbox(label="Source URL")
142
- text_input = gr.TextArea(label="Rule Text")
143
- add_btn = gr.Button("Add Manually", variant="primary")
144
 
145
  gr.Markdown("---")
146
- gr.Markdown("#### Document Ingestion")
147
- doc_act_name = gr.Textbox(label="Legislation Name for Document", placeholder="e.g., Internal Policy 2024")
148
- doc_input = gr.File(label="Upload PDF, DOCX, or TXT")
149
- doc_btn = gr.Button("Process Document", variant="secondary")
 
 
150
 
151
  with gr.Column():
152
- status_out = gr.Textbox(label="Status")
153
- stats_view = gr.Textbox(label="Dataset Stats", value=view_dataset_stats())
154
- refresh_btn = gr.Button("Refresh Stats")
155
 
156
  with gr.Tab("🏦 FCA Guidelines"):
157
  gr.Markdown("### 🛠️ FCA Handbook Automation")
@@ -161,7 +172,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
161
  with gr.Tab("☁️ Sync to Hugging Face"):
162
  hf_token = gr.Textbox(label="HF Write Token", type="password")
163
  hf_id = gr.Textbox(label="Dataset ID")
164
- sync_btn = gr.Button("Sync Now")
165
  sync_status = gr.Textbox(label="Sync Status")
166
 
167
  add_btn.click(
 
8
  from huggingface_hub import HfApi
9
  from pypdf import PdfReader
10
  import docx
11
+ import traceback
12
 
13
+ DATA_FILE = "legislation_rules.json" # Relative path for HF environment
14
 
15
  def load_data():
16
  if os.path.exists(DATA_FILE):
 
33
  det_id = get_canonical_hash(text)
34
 
35
  if any(r['deterministic_id'] == det_id for r in data):
36
+ return f"Warning: Rule '{title}' already exists (matching hash)."
37
 
38
  new_rule = {
39
  "act": act,
 
47
  save_data(data)
48
  return f"Successfully added: {title} from {act}"
49
 
50
+ def process_document(file_obj, act_name):
51
  """
52
+ Robustly extracts text from PDF, DOCX, or TXT files.
53
  """
54
+ if file_obj is None:
55
+ return "Error: No file was uploaded."
56
+
57
+ # Handle Gradio's file object (it can be a string path or a file-like object)
58
+ file_path = file_obj.name if hasattr(file_obj, 'name') else file_obj
59
+
60
+ if not os.path.exists(file_path):
61
+ return f"Error: File not found at {file_path}. Please try uploading again."
62
 
63
  ext = os.path.splitext(file_path)[1].lower()
64
  text = ""
 
68
  if ext == ".pdf":
69
  reader = PdfReader(file_path)
70
  for page in reader.pages:
71
+ extracted = page.extract_text()
72
+ if extracted:
73
+ text += extracted + "\n"
74
  elif ext == ".docx":
75
  doc = docx.Document(file_path)
76
  text = "\n".join([para.text for para in doc.paragraphs])
77
  elif ext == ".txt":
78
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
79
  text = f.read()
80
  else:
81
+ return f"Error: Unsupported file type '{ext}'. Use PDF, DOCX, or TXT."
82
 
83
  if not text.strip():
84
+ return "Error: Could not extract any text. The file might be empty or an image-based PDF."
85
 
86
  return add_rule_manually(act_name, title, text, f"Uploaded File: {title}")
87
  except Exception as e:
88
+ error_msg = traceback.format_exc()
89
+ print(error_msg) # Log to HF console
90
+ return f"Processing Failed: {str(e)}"
91
 
92
  def scrape_fca_prin():
93
  url = "https://handbook.fca.org.uk/handbook/PRIN/2/1.html"
94
  try:
95
+ response = requests.get(url, timeout=10)
96
  soup = BeautifulSoup(response.content, 'html.parser')
97
  principles_table = soup.find('table')
98
  if not principles_table:
99
+ return "Error: Could not find the Principles table. The FCA site structure may have changed."
100
  rows = principles_table.find_all('tr')
101
  added_count = 0
102
  for row in rows:
 
146
  with gr.Tab("➕ Add Rule"):
147
  with gr.Row():
148
  with gr.Column():
149
+ gr.Markdown("#### 📂 Option A: Document Ingestion")
150
+ doc_act_name = gr.Textbox(label="Legislation/Source Name", placeholder="e.g., Consumer Rights Act 2015")
151
+ doc_input = gr.File(label="Upload PDF, DOCX, or TXT")
152
+ doc_btn = gr.Button("Process Document", variant="primary")
 
 
153
 
154
  gr.Markdown("---")
155
+ gr.Markdown("#### ✍️ Option B: Manual Entry")
156
+ act_input = gr.Textbox(label="Act Name")
157
+ title_input = gr.Textbox(label="Section Title")
158
+ source_input = gr.Textbox(label="Source URL")
159
+ text_input = gr.TextArea(label="Rule Text")
160
+ add_btn = gr.Button("Add Manually")
161
 
162
  with gr.Column():
163
+ status_out = gr.Textbox(label="Operation Status")
164
+ stats_view = gr.Textbox(label="Dataset Inventory", value=view_dataset_stats())
165
+ refresh_btn = gr.Button("Refresh Inventory")
166
 
167
  with gr.Tab("🏦 FCA Guidelines"):
168
  gr.Markdown("### 🛠️ FCA Handbook Automation")
 
172
  with gr.Tab("☁️ Sync to Hugging Face"):
173
  hf_token = gr.Textbox(label="HF Write Token", type="password")
174
  hf_id = gr.Textbox(label="Dataset ID")
175
+ sync_btn = gr.Button("Sync to Dataset")
176
  sync_status = gr.Textbox(label="Sync Status")
177
 
178
  add_btn.click(