"""
Document type validation utility.

Helps identify whether uploaded documents are actually patents.
"""
| | import re |
| | from pathlib import Path |
| | from typing import Tuple, List |
| | from loguru import logger |
| |
|
| |
|
class DocumentValidator:
    """Validate that uploaded documents are patents

    All checks are heuristics run on a lower-cased copy of the text. Terms
    are matched only where they begin a word, so short terms are not found
    inside unrelated longer words (``news`` in ``renews``, ``claim`` in
    ``disclaimer``, ``conference`` in ``videoconference``). Suffixes are still
    accepted, so ``claim`` keeps matching ``claims``.
    """

    # Vocabulary counted by validate_patent_document(); each entry counts at
    # most once, however often it occurs in the text.
    PATENT_KEYWORDS = [
        'patent', 'claim', 'claims', 'invention', 'abstract',
        'field of invention', 'background', 'detailed description',
        'inventor', 'assignee', 'filing date', 'application',
    ]

    # Terms whose absence is reported as a "missing required sections" issue.
    REQUIRED_SECTIONS = ['abstract', 'claim']

    # A reference to a numbered claim such as "claim 1" or "claim 12".
    _NUMBERED_CLAIM = re.compile(r'(?<!\w)claim\s+\d+')

    @staticmethod
    def _contains_term(text_lower: str, term: str) -> bool:
        """Return True if *term* occurs in *text_lower* at the start of a word."""
        # The negative lookbehind rejects hits that begin in the middle of a
        # word; re.escape keeps multi-word terms literal.
        return re.search(r'(?<!\w)' + re.escape(term), text_lower) is not None

    @staticmethod
    def validate_patent_document(text: str) -> Tuple[bool, List[str]]:
        """
        Validate if document text appears to be a patent

        Args:
            text: Extracted document text

        Returns:
            Tuple of (is_valid, issues_found). The document is valid when at
            least 3 patent keywords and at least one numbered claim are
            found. The length and missing-section issues are advisory: they
            are reported in issues_found but do not make the document
            invalid on their own, so issues_found may be non-empty even when
            is_valid is True.
        """
        text_lower = text.lower()
        issues = []

        # Length is measured on the raw text, whitespace included.
        if len(text) < 500:
            issues.append("Document too short (< 500 characters)")

        keyword_matches = sum(
            1 for kw in DocumentValidator.PATENT_KEYWORDS
            if DocumentValidator._contains_term(text_lower, kw)
        )
        if keyword_matches < 3:
            issues.append(f"Only {keyword_matches} patent keywords found (expected at least 3)")

        missing_sections = [
            section for section in DocumentValidator.REQUIRED_SECTIONS
            if not DocumentValidator._contains_term(text_lower, section)
        ]
        if missing_sections:
            issues.append(f"Missing required sections: {', '.join(missing_sections)}")

        claims_found = len(DocumentValidator._NUMBERED_CLAIM.findall(text_lower))
        if claims_found == 0:
            issues.append("No numbered claims found")

        # Lenient on purpose: enough keywords plus a numbered claim is
        # accepted even if other issues were recorded above.
        is_valid = len(issues) == 0 or (keyword_matches >= 3 and claims_found > 0)

        if not is_valid:
            logger.warning(f"Document validation failed: {issues}")

        return is_valid, issues

    @staticmethod
    def identify_document_type(text: str) -> str:
        """
        Try to identify what type of document this is

        The non-patent heuristics are tried in a fixed order and the first
        one that matches wins; the patent check runs only if none of them
        match.

        Args:
            text: Extracted document text

        Returns:
            Document type description
        """
        text_lower = text.lower()

        def has_any(terms: List[str]) -> bool:
            return any(DocumentValidator._contains_term(text_lower, term) for term in terms)

        if (DocumentValidator._contains_term(text_lower, 'microsoft')
                and DocumentValidator._contains_term(text_lower, 'windows')):
            return "Microsoft Windows documentation"

        if has_any(['press release', 'news', 'announcement']):
            return "Press release or news article"

        if has_any(['whitepaper', 'white paper', 'technical report']):
            return "Technical whitepaper or report"

        if has_any(['terms of service', 'privacy policy', 'license agreement']):
            return "Legal agreement or policy document"

        if has_any(['research paper', 'ieee', 'conference']):
            return "Academic research paper"

        # Nothing above matched: fall back to the patent heuristics. This
        # call logs a warning when the text does not validate.
        is_patent, _ = DocumentValidator.validate_patent_document(text)
        if is_patent:
            return "Patent document"

        return "Unknown document type (not a patent)"
| |
|
| |
|
def validate_and_log(text: str, document_name: str = "document") -> bool:
    """
    Validate a document and report the outcome through the logger

    Args:
        text: Document text
        document_name: Label used for the document in the log messages

    Returns:
        True if the text passes patent validation, False otherwise
    """
    passed, problems = DocumentValidator.validate_patent_document(text)

    if passed:
        logger.success(f"✅ {document_name} appears to be a valid patent")
        return True

    # A type guess is made only for rejected documents, so the log shows
    # what was detected in place of a patent.
    detected = DocumentValidator.identify_document_type(text)
    logger.error(f"❌ {document_name} is NOT a valid patent")
    logger.error(f" Detected type: {detected}")
    logger.error(f" Issues: {', '.join(problems)}")
    return False
| |
|