Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -122,6 +122,197 @@ def table_figure_reference_check(paragraphs, doc_type):
|
|
| 122 |
|
| 123 |
return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
def double_period_check(paragraphs):
|
| 126 |
incorrect_sentences = []
|
| 127 |
|
|
@@ -222,21 +413,20 @@ def process_document(file_obj, doc_type, template_type):
|
|
| 222 |
paragraphs = [para.text for para in doc.paragraphs]
|
| 223 |
required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
|
| 224 |
|
| 225 |
-
#
|
| 226 |
heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
|
| 227 |
acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
|
| 228 |
legal_valid, incorrect_legal_references = legal_check(paragraphs)
|
| 229 |
table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
|
| 230 |
figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
|
| 231 |
references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
|
|
|
|
| 232 |
double_period_valid, incorrect_sentences = double_period_check(paragraphs)
|
| 233 |
spacing_valid, incorrect_spacing = spacing_check(paragraphs)
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
# Ensure title style check is included
|
| 237 |
-
title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type) if doc_type in ["Advisory Circular", "Order"] else (True, [])
|
| 238 |
|
| 239 |
-
#
|
| 240 |
results = format_results_for_gradio(
|
| 241 |
heading_valid=heading_valid, headings_found=headings_found,
|
| 242 |
acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
|
|
@@ -244,10 +434,11 @@ def process_document(file_obj, doc_type, template_type):
|
|
| 244 |
table_valid=table_valid, incorrect_captions=incorrect_captions,
|
| 245 |
figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
|
| 246 |
references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
|
|
|
|
| 247 |
double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
|
| 248 |
spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
|
| 249 |
-
|
| 250 |
-
placeholder_issues=placeholder_issues,
|
| 251 |
required_headings=required_headings, doc_type=doc_type
|
| 252 |
)
|
| 253 |
return results
|
|
@@ -255,22 +446,6 @@ def process_document(file_obj, doc_type, template_type):
|
|
| 255 |
print(f"Error in process_document: {str(e)}")
|
| 256 |
return f"An error occurred while processing the document: {str(e)}"
|
| 257 |
|
| 258 |
-
def get_document_checks(doc_type, template_type):
    """Return the required headings and other checks based on document type.

    Args:
        doc_type: Document type name. Only "Advisory Circular" has headings
            defined in this version.
        template_type: Template name. For an Advisory Circular,
            "Short AC template AC" selects the short heading list; any other
            value selects the long heading list.

    Returns:
        A dict with a single "required_headings" key mapping to a list of
        heading strings. The list is empty for every document type other
        than "Advisory Circular".
    """
    if doc_type == "Advisory Circular":
        if template_type == "Short AC template AC":
            return {
                "required_headings": [
                    "Purpose",
                    "Applicability",
                    "Related Reading Material",
                    "Background",
                    "Discussion",
                ]
            }
        # Any other template value is treated as the long AC template.
        return {
            "required_headings": [
                "Purpose",
                "Applicability",
                "Audience",
                "Related Reading Material",
                "Background",
                "Discussion",
                "Conclusion",
            ]
        }
    # Add other document types as needed
    return {"required_headings": []}
|
| 273 |
-
|
| 274 |
def format_results_for_gradio(**kwargs):
|
| 275 |
"""Format the results for display in Gradio."""
|
| 276 |
results = []
|
|
@@ -473,8 +648,7 @@ with demo:
|
|
| 473 |
with gr.Column(scale=2):
|
| 474 |
output = gr.Markdown(
|
| 475 |
label="Check Results",
|
| 476 |
-
|
| 477 |
-
)
|
| 478 |
|
| 479 |
# Update template type visibility based on document type
|
| 480 |
def update_template_visibility(doc_type):
|
|
|
|
| 122 |
|
| 123 |
return len(incorrect_table_figure_references) == 0, incorrect_table_figure_references
|
| 124 |
|
| 125 |
+
def document_title_check(doc_path, doc_type):
    """Check the formatting of titles that follow "AC <number>" references.

    Every paragraph of the document is scanned for text matching
    ``AC <digits>[-<digits>]`` followed by a title. Each captured title is
    compared against the italics/quotes rule for ``doc_type``.

    Args:
        doc_path: Value passed straight to ``Document(...)`` to open the
            document.
        doc_type: Document type name; must be a key of the formatting rules
            table below.

    Returns:
        A tuple ``(is_valid, incorrect_titles)``. ``is_valid`` is True when
        no issue was found. ``incorrect_titles`` is a list of dicts with the
        keys ``'text'`` (the full matched reference) and ``'issue'`` (a
        comma-separated description of what is wrong).

    Raises:
        ValueError: If ``doc_type`` has no entry in the formatting rules.
    """
    incorrect_titles = []
    doc = Document(doc_path)

    # Updated pattern to capture titles correctly: the title is the shortest
    # text after the AC number that ends before a period, a comma, or the
    # end of the paragraph text.
    ac_pattern = re.compile(r'AC\s+\d+(?:-\d+)?(?:,|\s)+(.+?)(?=\.|,|$)')

    # Define formatting rules for different document types
    formatting_rules = {
        "Advisory Circular": {"italics": True, "quotes": False},
        "Airworthiness Criteria": {"italics": False, "quotes": True},
        "Deviation Memo": {"italics": False, "quotes": True},
        "Exemption": {"italics": False, "quotes": True},
        "Federal Register Notice": {"italics": False, "quotes": True},
        "Handbook/Manual": {"italics": False, "quotes": False},
        "Order": {"italics": False, "quotes": True},
        "Policy Statement": {"italics": False, "quotes": False},
        "Rule": {"italics": False, "quotes": True},
        "Special Condition": {"italics": False, "quotes": True},
        "Technical Standard Order": {"italics": False, "quotes": True},
        "Other": {"italics": False, "quotes": False},
    }

    # Get the rules for the current document type
    if doc_type not in formatting_rules:
        raise ValueError(f"Unsupported document type: {doc_type}")

    required_format = formatting_rules[doc_type]

    # Straight and curly (smart) quotation marks. The curly ones are written
    # as escapes so they cannot be silently normalised to ASCII, which would
    # also turn two adjacent single-quote literals into one triple-quoted
    # string.
    # NOTE(review): "'" also matches an apostrophe inside a title; confirm
    # whether such titles should count as quoted.
    quote_chars = ('"', "'", "\u201c", "\u201d", "\u2018", "\u2019")

    for paragraph in doc.paragraphs:
        text = paragraph.text

        for match in ac_pattern.finditer(text):
            full_match = match.group(0)
            title_text = match.group(1).strip()

            # Get the position where the title starts
            title_start = match.start(1)

            # Check for any type of quotation marks, including smart quotes
            title_in_quotes = any(q in title_text for q in quote_chars)

            # Italics are read from the single run that contains the first
            # character of the title; later runs are not inspected.
            title_is_italicized = False
            current_pos = 0
            for run in paragraph.runs:
                run_length = len(run.text)
                if current_pos <= title_start < current_pos + run_length:
                    # run.italic may be None; treat that as not italic.
                    title_is_italicized = bool(run.italic)
                    break
                current_pos += run_length

            # Collect every way the title deviates from the required format
            issue_message = []

            # Check italics requirement
            if required_format["italics"] and not title_is_italicized:
                issue_message.append("should be italicized")
            elif not required_format["italics"] and title_is_italicized:
                issue_message.append("should not be italicized")

            # Check quotes requirement
            if required_format["quotes"] and not title_in_quotes:
                issue_message.append("should be in quotes")
            elif not required_format["quotes"] and title_in_quotes:
                issue_message.append("should not be in quotes")

            if issue_message:
                incorrect_titles.append({
                    'text': full_match,
                    'issue': ', '.join(issue_message)
                })

    return len(incorrect_titles) == 0, incorrect_titles
|
| 206 |
+
|
| 207 |
+
def get_document_checks(doc_type, template_type):
    """Return the required headings and other checks based on document type.

    Args:
        doc_type: Document type name used as the key into the checks table.
        template_type: Template name; only consulted when ``doc_type`` is
            "Advisory Circular", where it selects the short or long template
            entry.

    Returns:
        The checks dict for the requested type (currently holding only a
        "required_headings" list). An empty dict is returned when the
        document type, or the Advisory Circular template, is not in the
        table.
    """
    document_checks = {
        "Advisory Circular": {
            "Short AC template AC": {
                "required_headings": [
                    "PURPOSE.",
                    "APPLICABILITY.",
                    "CANCELLATION.",
                    "RELATED MATERIAL.",
                    "DEFINITION OF KEY TERMS."
                ]
            },
            "Long AC template AC": {
                "required_headings": [
                    "Purpose.",
                    "Applicability.",
                    "Cancellation.",
                    "Related Material.",
                    "Definition of Key Terms."
                ]
            }
        },
        "Airworthiness Criteria": {
            "required_headings": [
                "TBD - Need to research"
            ]
        },
        "Deviation Memo": {
            "required_headings": [
                "TBD - Need to research"
            ]
        },
        "Exemption": {
            "required_headings": [
                "TBD - Need to research"
            ]
        },
        "Federal Register Notice": {
            "required_headings": [
                "Purpose of This Notice",
                "Audience",
                "Where can I Find This Notice"
            ]
        },
        "Handbook/Manual": {
            "required_headings": [
                "TBD - Need to research"
            ]
        },
        "Order": {
            "required_headings": [
                "Purpose of This Order.",
                "Audience.",
                "Where to Find This Order."
            ]
        },
        "Policy Statement": {
            "required_headings": [
                "SUMMARY",
                "CURRENT REGULATORY AND ADVISORY MATERIAL",
                "RELEVANT PAST PRACTICE",
                "POLICY",
                "EFFECT OF POLICY",
                "CONCLUSION"
            ]
        },
        "Rule": {
            "required_headings": [
                "TBD - Need to research"
            ]
        },
        "Special Condition": {
            "required_headings": [
                "TBD - Need to research"
            ]
        },
        "Technical Standard Order": {
            "required_headings": [
                "PURPOSE.",
                "APPLICABILITY.",
                "REQUIREMENTS.",
                "MARKING.",
                "APPLICATION DATA REQUIREMENTS.",
                "MANUFACTURER DATA REQUIREMENTS.",
                "FURNISHED DATA REQUIREMENTS.",
                "HOW TO GET REFERENCED DOCUMENTS."
            ]
        },
        "Other": {
            "required_headings": [
                "N/A"
            ]
        }
    }

    # Add debugging logs (lazy %-style arguments; message text is unchanged)
    logger = logging.getLogger(__name__)
    logger.info("Requested document type: %s", doc_type)
    logger.info("Requested template type: %s", template_type)

    checks = document_checks.get(doc_type, {})
    if doc_type == "Advisory Circular":
        # Advisory Circular entries are nested one level deeper, per template.
        checks = checks.get(template_type, {})

    logger.info("Retrieved checks: %s", checks)
    return checks
|
| 315 |
+
|
| 316 |
def double_period_check(paragraphs):
|
| 317 |
incorrect_sentences = []
|
| 318 |
|
|
|
|
| 413 |
paragraphs = [para.text for para in doc.paragraphs]
|
| 414 |
required_headings = get_document_checks(doc_type, template_type).get("required_headings", [])
|
| 415 |
|
| 416 |
+
# Perform each check with `paragraphs` as input
|
| 417 |
heading_valid, headings_found = heading_title_check(paragraphs, required_headings)
|
| 418 |
acronyms_valid, undefined_acronyms = acronym_check(paragraphs)
|
| 419 |
legal_valid, incorrect_legal_references = legal_check(paragraphs)
|
| 420 |
table_valid, incorrect_captions = table_caption_check(paragraphs, doc_type)
|
| 421 |
figure_valid, incorrect_fig_captions = figure_caption_check(paragraphs, doc_type)
|
| 422 |
references_valid, incorrect_table_figure_references = table_figure_reference_check(paragraphs, doc_type)
|
| 423 |
+
title_style_valid, incorrect_titles = document_title_check(file_obj, doc_type) if doc_type in ["Advisory Circular", "Order"] else (True, [])
|
| 424 |
double_period_valid, incorrect_sentences = double_period_check(paragraphs)
|
| 425 |
spacing_valid, incorrect_spacing = spacing_check(paragraphs)
|
| 426 |
+
date_issues = check_date_formats(paragraphs) # Pass paragraphs here
|
| 427 |
+
placeholder_issues = check_placeholders(paragraphs) # Pass paragraphs here
|
|
|
|
|
|
|
| 428 |
|
| 429 |
+
# Format results
|
| 430 |
results = format_results_for_gradio(
|
| 431 |
heading_valid=heading_valid, headings_found=headings_found,
|
| 432 |
acronyms_valid=acronyms_valid, undefined_acronyms=undefined_acronyms,
|
|
|
|
| 434 |
table_valid=table_valid, incorrect_captions=incorrect_captions,
|
| 435 |
figure_valid=figure_valid, incorrect_fig_captions=incorrect_fig_captions,
|
| 436 |
references_valid=references_valid, incorrect_table_figure_references=incorrect_table_figure_references,
|
| 437 |
+
title_style_valid=title_style_valid, incorrect_titles=incorrect_titles,
|
| 438 |
double_period_valid=double_period_valid, incorrect_sentences=incorrect_sentences,
|
| 439 |
spacing_valid=spacing_valid, incorrect_spacing=incorrect_spacing,
|
| 440 |
+
date_issues=date_issues, # Added date_issues
|
| 441 |
+
placeholder_issues=placeholder_issues, # Added placeholder_issues
|
| 442 |
required_headings=required_headings, doc_type=doc_type
|
| 443 |
)
|
| 444 |
return results
|
|
|
|
| 446 |
print(f"Error in process_document: {str(e)}")
|
| 447 |
return f"An error occurred while processing the document: {str(e)}"
|
| 448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
def format_results_for_gradio(**kwargs):
|
| 450 |
"""Format the results for display in Gradio."""
|
| 451 |
results = []
|
|
|
|
| 648 |
with gr.Column(scale=2):
|
| 649 |
output = gr.Markdown(
|
| 650 |
label="Check Results",
|
| 651 |
+
).markdown("Results will appear here after processing...")
|
|
|
|
| 652 |
|
| 653 |
# Update template type visibility based on document type
|
| 654 |
def update_template_visibility(doc_type):
|