| """ |
| DOCX Parser for AI Writer. |
| Extracts text from .docx files for dataset and knowledge base processing. |
| """ |
|
|
| import os |
| from docx import Document |
| from typing import List, Dict, Optional |
|
|
|
|
def parse_docx(file_path: str) -> str:
    """Return the non-empty text of a .docx file, one chunk per line.

    Body paragraphs are emitted first, followed by the text of every table
    cell (table by table, row by row). Each chunk is stripped of surrounding
    whitespace and empty chunks are dropped.

    Args:
        file_path: Path handed to ``Document`` to open the file.

    Returns:
        The collected chunks joined with newlines. If anything raises while
        opening or reading the document, the exception is not propagated;
        a string of the form ``"Error parsing <file_path>: <message>"`` is
        returned instead.
    """
    try:
        document = Document(file_path)

        # Paragraph text first, then table cell text, matching read order
        # of the two collections rather than their position in the file.
        chunks = [para.text.strip() for para in document.paragraphs]
        for table in document.tables:
            for row in table.rows:
                chunks.extend(cell.text.strip() for cell in row.cells)

        # Blank chunks are filtered once, at join time.
        return "\n".join(chunk for chunk in chunks if chunk)
    except Exception as exc:
        return f"Error parsing {file_path}: {str(exc)}"
|
|
|
|
def parse_multiple_docx(file_paths: List[str]) -> Dict[str, str]:
    """Extract text from several .docx files.

    Paths whose extension is not ``.docx`` are skipped. The extension check
    is case-insensitive, so ``Report.DOCX`` is processed as well.

    Args:
        file_paths: Paths of candidate files.

    Returns:
        A dict mapping each file's base name to the string returned by
        ``parse_docx`` for it. Because keys are base names, two paths that
        share a file name map to the same key and the later one wins.
    """
    results = {}
    for path in file_paths:
        # Lower-case before comparing so upper/mixed-case extensions match.
        if not path.lower().endswith('.docx'):
            continue
        results[os.path.basename(path)] = parse_docx(path)
    return results
|
|
|
|
def extract_style_features(text: str) -> Dict:
    """Compute simple writing-style statistics for a piece of text.

    Paragraphs are the non-blank lines of ``text``. Sentences are obtained
    by treating ``!`` and ``?`` as ``.`` and splitting each paragraph on
    ``.``; this is a heuristic, so abbreviations and decimals also split.
    Words are whitespace-separated tokens of the whole text.

    Args:
        text: The text to analyze.

    Returns:
        A dict with the keys ``avg_sentence_length``,
        ``avg_paragraph_length``, ``contraction_count``,
        ``sentence_starts_with_conjunction``, ``total_sentences``,
        ``total_paragraphs`` and ``total_words``. All values are 0 when
        ``text`` is empty or whitespace only.
    """
    features = {
        "avg_sentence_length": 0,
        "avg_paragraph_length": 0,
        "contraction_count": 0,
        "sentence_starts_with_conjunction": 0,
        "total_sentences": 0,
        "total_paragraphs": 0,
        "total_words": 0,
    }

    if not text.strip():
        return features

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    features["total_paragraphs"] = len(paragraphs)

    all_sentences = []
    for para in paragraphs:
        unified = para.replace('!', '.').replace('?', '.')
        all_sentences.extend(s.strip() for s in unified.split('.') if s.strip())
    features["total_sentences"] = len(all_sentences)

    features["total_words"] = len(text.split())

    if features["total_sentences"] > 0:
        features["avg_sentence_length"] = features["total_words"] / features["total_sentences"]

    if features["total_paragraphs"] > 0:
        features["avg_paragraph_length"] = features["total_words"] / features["total_paragraphs"]

    # Word processors usually emit the typographic apostrophe (U+2019)
    # instead of the ASCII one, so fold it to "'" before counting; otherwise
    # contractions in such text are never seen. Lower-casing is done once.
    # NOTE: "'s" also matches possessives; the count is an approximation.
    folded = text.lower().replace("\u2019", "'")
    contractions = ("n't", "'re", "'ve", "'ll", "'s", "'m", "'d")
    features["contraction_count"] = sum(folded.count(c) for c in contractions)

    # Strip punctuation and quotes hugging the first word so that openers
    # such as "However," or an opening-quoted "But" are still recognised.
    conjunction_starts = {"but", "and", "so", "still", "yet", "or", "however"}
    edge_punctuation = "\"'\u201c\u201d\u2018\u2019()[]{},;:-\u2013\u2014\u2026"
    for sentence in all_sentences:
        tokens = sentence.split()
        first_word = tokens[0].strip(edge_punctuation).lower() if tokens else ""
        if first_word in conjunction_starts:
            features["sentence_starts_with_conjunction"] += 1

    return features
|
|
|
|
def build_style_profile(texts: Dict[str, str]) -> str:
    """Render a plain-text writing style profile for a set of documents.

    All document texts are joined with newlines and passed to
    ``extract_style_features``; the resulting statistics are listed one per
    line. Up to 15 sample sentences follow, taken in document order: at
    most three per line of text, considering only lines longer than 20
    characters and sentence fragments longer than 15 characters.

    Args:
        texts: Mapping of document name to document text. Only the values
            and the number of entries are used.

    Returns:
        The profile as a single newline-joined string.
    """
    stats = extract_style_features("\n".join(texts.values()))

    lines = [
        f"Writing Style Profile (analyzed from {len(texts)} document(s)):",
        f"- Average sentence length: {stats['avg_sentence_length']:.1f} words",
        f"- Average paragraph length: {stats['avg_paragraph_length']:.1f} words",
        f"- Total words analyzed: {stats['total_words']}",
        f"- Contractions used: {stats['contraction_count']}",
        f"- Sentences starting with conjunctions: {stats['sentence_starts_with_conjunction']}",
        f"- Total sentences: {stats['total_sentences']}",
        f"- Total paragraphs: {stats['total_paragraphs']}",
    ]

    samples = []
    for document_text in texts.values():
        for raw_line in document_text.split('\n'):
            line = raw_line.strip()
            # Short lines (headings, labels) are not useful as samples.
            if len(line) <= 20:
                continue
            unified = line.replace('!', '.').replace('?', '.')
            fragments = [piece.strip() for piece in unified.split('.')]
            usable = [fragment for fragment in fragments if len(fragment) > 15]
            samples.extend(usable[:3])

    if samples:
        lines.append("\nSample sentences for style reference:")
        lines.extend(
            f" {number}. {sample}"
            for number, sample in enumerate(samples[:15], 1)
        )

    return "\n".join(lines)
|
|
|
|
def build_knowledge_base_summary(text: str, max_length: int = 8000) -> str:
    """Condense knowledge base text to at most ``max_length`` characters.

    Text that already fits is returned unchanged. Otherwise the non-blank
    lines are treated as paragraphs and the summary keeps the first 30% of
    them (at least one) and the last 10% (at least one, when any remain
    after the head). A ``"..."`` line marks the gap, and is only inserted
    when paragraphs were actually left out. The joined result is finally
    cut to ``max_length`` characters.

    Args:
        text: Full knowledge base content.
        max_length: Maximum number of characters to return. Values below
            zero are treated as zero.

    Returns:
        The original text, or the condensed version described above. An
        over-long text that contains only whitespace yields ``""``.
    """
    if len(text) <= max_length:
        return text

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    total = len(paragraphs)
    if total == 0:
        return ""

    first_count = max(1, int(total * 0.3))
    # Cap the tail so it never overlaps the head; with very few paragraphs
    # the uncapped slices would repeat the same paragraph twice.
    last_count = min(max(1, int(total * 0.1)), total - first_count)

    selected = paragraphs[:first_count]
    if first_count + last_count < total:
        selected.append("...")
    # Index from the front: paragraphs[-0:] would select the whole list.
    selected.extend(paragraphs[total - last_count:])

    result = "\n".join(selected)
    # Clamp so a negative limit cannot slice from the wrong end.
    return result[:max(max_length, 0)]
|
|