File size: 5,101 Bytes
1e906e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
DOCX Parser for AI Writer.
Extracts text from .docx files for dataset and knowledge base processing.
"""

import os
from docx import Document
from typing import List, Dict, Optional


def parse_docx(file_path: str) -> str:
    """Return the text of one .docx file as newline-separated chunks.

    Body paragraphs come first, followed by the text of every table cell
    (tables in document order, row by row, cell by cell). Each chunk is
    stripped of surrounding whitespace and empty chunks are dropped.

    Any exception raised while opening or reading the document is caught
    and reported through the return value as
    ``"Error parsing <file_path>: <message>"`` instead of being raised.
    """

    def stripped_chunks(document):
        # Paragraph text first ...
        for para in document.paragraphs:
            yield para.text.strip()
        # ... then the content of every table cell.
        for table in document.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield cell.text.strip()

    try:
        # The generator is consumed inside the try, so read errors are
        # handled the same way as errors from opening the file.
        chunks = stripped_chunks(Document(file_path))
        return "\n".join(chunk for chunk in chunks if chunk)
    except Exception as err:
        return f"Error parsing {file_path}: {err!s}"


def parse_multiple_docx(file_paths: List[str]) -> Dict[str, str]:
    """Extract text from multiple .docx files.

    Args:
        file_paths: Paths to consider. Entries whose name does not end in
            ``.docx`` are skipped. The extension check is case-insensitive,
            so ``REPORT.DOCX`` is processed as well.

    Returns:
        A dict mapping each file's base name (directory part removed) to
        the string returned by ``parse_docx`` for that path. If two paths
        share a base name, the later one in ``file_paths`` wins.
    """
    return {
        os.path.basename(path): parse_docx(path)
        for path in file_paths
        # Compare on the lower-cased path so upper/mixed-case extensions
        # are not silently dropped.
        if path.lower().endswith('.docx')
    }


def extract_style_features(text: str) -> Dict:
    """Analyze text to extract simple writing style features.

    Paragraphs are the non-blank lines of ``text``. Sentences are obtained
    by splitting each paragraph on ``.``, ``!`` and ``?`` (a deliberately
    simple heuristic: abbreviations and decimals are split as well). Words
    are whitespace-separated tokens of the whole text.

    Args:
        text: The text to analyze.

    Returns:
        A dict with the keys ``avg_sentence_length``,
        ``avg_paragraph_length``, ``contraction_count``,
        ``sentence_starts_with_conjunction``, ``total_sentences``,
        ``total_paragraphs`` and ``total_words``. All values are 0 when
        ``text`` is empty or whitespace-only.

    Notes:
        ``contraction_count`` counts occurrences of common contraction
        suffixes; ``'s`` is among them, so possessives are counted too.
    """
    features = {
        "avg_sentence_length": 0,
        "avg_paragraph_length": 0,
        "contraction_count": 0,
        "sentence_starts_with_conjunction": 0,
        "total_sentences": 0,
        "total_paragraphs": 0,
        "total_words": 0,
    }

    if not text.strip():
        return features

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    features["total_paragraphs"] = len(paragraphs)

    all_sentences = []
    for para in paragraphs:
        # Simple sentence splitting: treat '!' and '?' like '.'.
        normalized = para.replace('!', '.').replace('?', '.')
        all_sentences.extend(s.strip() for s in normalized.split('.') if s.strip())
    features["total_sentences"] = len(all_sentences)

    features["total_words"] = len(text.split())

    # A paragraph made only of punctuation (e.g. "...") yields no sentences,
    # so both divisions stay guarded.
    if features["total_sentences"] > 0:
        features["avg_sentence_length"] = features["total_words"] / features["total_sentences"]

    if features["total_paragraphs"] > 0:
        features["avg_paragraph_length"] = features["total_words"] / features["total_paragraphs"]

    # Count contractions. Word processors usually replace the ASCII
    # apostrophe with the typographic one (U+2019), so normalize it first;
    # otherwise contractions in such text are never counted.
    lowered = text.lower().replace("\u2019", "'")
    contractions = ("n't", "'re", "'ve", "'ll", "'s", "'m", "'d")
    features["contraction_count"] = sum(lowered.count(c) for c in contractions)

    # Count sentences starting with conjunctions. Punctuation attached to
    # the first token (a trailing comma as in "However,", an opening quote
    # or bracket) is stripped so it does not hide the match.
    conjunction_starts = {"but", "and", "so", "still", "yet", "or", "however"}
    edge_punctuation = "\"'\u2018\u2019\u201c\u201d()[]{},;:-\u2013\u2014"
    for sentence in all_sentences:
        words = sentence.split()
        first_word = words[0].strip(edge_punctuation).lower() if words else ""
        if first_word in conjunction_starts:
            features["sentence_starts_with_conjunction"] += 1

    return features


def build_style_profile(texts: Dict[str, str]) -> str:
    """Render a human-readable style profile for a set of documents.

    All document texts are joined with newlines and passed to
    ``extract_style_features``; the resulting numbers form the header of
    the profile. Up to 15 sample sentences follow, taken in document order
    with at most three per line of text. Only lines longer than 20
    characters and sentence fragments longer than 15 characters qualify.

    Args:
        texts: Mapping whose values are document texts; only the values and
            the number of entries are used.

    Returns:
        The profile as a single newline-joined string.
    """
    stats = extract_style_features("\n".join(texts.values()))

    lines = [
        f"Writing Style Profile (analyzed from {len(texts)} document(s)):",
        f"- Average sentence length: {stats['avg_sentence_length']:.1f} words",
        f"- Average paragraph length: {stats['avg_paragraph_length']:.1f} words",
        f"- Total words analyzed: {stats['total_words']}",
        f"- Contractions used: {stats['contraction_count']}",
        f"- Sentences starting with conjunctions: {stats['sentence_starts_with_conjunction']}",
        f"- Total sentences: {stats['total_sentences']}",
        f"- Total paragraphs: {stats['total_paragraphs']}",
    ]

    # Gather candidate sample sentences, at most three per line of text.
    samples = []
    for document_text in texts.values():
        for raw_line in document_text.split('\n'):
            line = raw_line.strip()
            if len(line) <= 20:
                continue
            unified = line.replace('!', '.').replace('?', '.')
            fragments = [piece.strip() for piece in unified.split('.')]
            usable = [fragment for fragment in fragments if len(fragment) > 15]
            samples.extend(usable[:3])

    if samples:
        lines.append("\nSample sentences for style reference:")
        lines.extend(
            f"  {number}. {sample}"
            for number, sample in enumerate(samples[:15], 1)
        )

    return "\n".join(lines)


def build_knowledge_base_summary(text: str, max_length: int = 8000) -> str:
    """Create a condensed summary of knowledge base content for context injection.

    Text that already fits within ``max_length`` is returned unchanged.
    Otherwise the non-blank lines ("paragraphs") are reduced to the first
    30% and the last 10% (at least one paragraph each), joined by a
    ``"..."`` line marking the omission.

    If that selection is still too long, it is trimmed so that both ends
    survive: roughly a quarter of the available characters are kept from
    the end of the closing part and the rest from the start of the opening
    part. Cutting only at the end would discard the conclusion that the
    selection was meant to keep.

    Args:
        text: The full knowledge base text.
        max_length: Maximum number of characters in the result.

    Returns:
        A string of at most ``max_length`` characters (empty when
        ``max_length`` is not positive or the over-long text contains only
        whitespace).
    """
    if len(text) <= max_length:
        return text
    if max_length <= 0:
        # A negative slice bound would otherwise return more than allowed.
        return ""

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    if not paragraphs:
        return ""
    if len(paragraphs) == 1:
        # Head and tail would be the same paragraph; emit it only once.
        return paragraphs[0][:max_length]

    # Take first 30% and last 10% to capture intro and conclusion.
    first_count = max(1, int(len(paragraphs) * 0.3))
    last_count = max(1, int(len(paragraphs) * 0.1))
    head = "\n".join(paragraphs[:first_count])
    tail = "\n".join(paragraphs[-last_count:])
    separator = "\n...\n"

    result = head + separator + tail
    if len(result) <= max_length:
        return result

    budget = max_length - len(separator)
    if budget <= 0:
        # Not even room for the omission marker: keep the start only.
        return head[:max_length]

    # Reserve about a quarter of the budget for the tail (mirroring the
    # 30%/10% selection ratio), then hand any space the head does not need
    # back to the tail.
    tail_budget = min(len(tail), budget // 4)
    head_budget = min(len(head), budget - tail_budget)
    tail_budget = min(len(tail), budget - head_budget)

    # Slice from an explicit start index so a zero budget yields "".
    tail_part = tail[len(tail) - tail_budget:]
    return head[:head_budget] + separator + tail_part