coingimp committed on
Commit
1e906e3
·
verified ·
1 Parent(s): cbff270

Create docx_parser.py

Browse files
Files changed (1) hide show
  1. utils/docx_parser.py +143 -0
utils/docx_parser.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DOCX Parser for AI Writer.
3
+ Extracts text from .docx files for dataset and knowledge base processing.
4
+ """
5
+
6
+ import os
7
+ from docx import Document
8
+ from typing import List, Dict, Optional
9
+
10
+
11
def parse_docx(file_path: str) -> str:
    """Return the text of one .docx file as newline-separated chunks.

    Body paragraphs are collected first, followed by the text of every
    table cell (table by table, row by row). Each chunk is stripped and
    blank chunks are dropped.

    Failures never propagate: any exception raised while opening or
    reading the document is converted into an ``"Error parsing ..."``
    message that is returned in place of the content.
    """
    try:
        document = Document(file_path)
        # Body paragraphs first ...
        chunks = [para.text.strip() for para in document.paragraphs]
        # ... then every table cell, appended after the body text.
        chunks += [
            cell.text.strip()
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        ]
        return "\n".join(chunk for chunk in chunks if chunk)
    except Exception as exc:
        return f"Error parsing {file_path}: {str(exc)}"
30
+
31
+
32
def parse_multiple_docx(file_paths: List[str]) -> Dict[str, str]:
    """Extract text from multiple .docx files.

    Args:
        file_paths: Paths to inspect. Plain strings and ``os.PathLike``
            objects (e.g. ``pathlib.Path``) are both accepted. Entries
            whose name does not end in ``.docx`` (compared
            case-insensitively, so ``REPORT.DOCX`` is included) are
            skipped.

    Returns:
        A dict mapping each file's base name to whatever ``parse_docx``
        returns for it. Because the key is the base name only, two paths
        with the same file name in different directories collide and the
        later one wins.
    """
    results = {}
    for path in file_paths:
        # Normalise PathLike objects so the string checks below work.
        path_str = os.fspath(path)
        if path_str.lower().endswith('.docx'):
            results[os.path.basename(path_str)] = parse_docx(path_str)
    return results
40
+
41
+
42
def extract_style_features(text: str) -> Dict:
    """Analyze text to extract simple writing-style statistics.

    Paragraphs are the non-blank lines of ``text``. Sentences are found
    by a naive split on ``.``, ``!`` and ``?``, so abbreviations and
    decimal numbers are counted as sentence breaks. Words are
    whitespace-separated tokens of the whole text.

    Args:
        text: The text to analyze.

    Returns:
        A dict with the keys ``avg_sentence_length``,
        ``avg_paragraph_length``, ``contraction_count``,
        ``sentence_starts_with_conjunction``, ``total_sentences``,
        ``total_paragraphs`` and ``total_words``. All values are 0 when
        ``text`` is empty or whitespace-only.
    """
    features = {
        "avg_sentence_length": 0,
        "avg_paragraph_length": 0,
        "contraction_count": 0,
        "sentence_starts_with_conjunction": 0,
        "total_sentences": 0,
        "total_paragraphs": 0,
        "total_words": 0,
    }

    if not text.strip():
        return features

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    features["total_paragraphs"] = len(paragraphs)

    all_sentences = []
    for para in paragraphs:
        # Simple sentence splitting: treat '!' and '?' like '.'.
        unified = para.replace('!', '.').replace('?', '.')
        all_sentences.extend(s.strip() for s in unified.split('.') if s.strip())

    features["total_sentences"] = len(all_sentences)
    features["total_words"] = len(text.split())

    if features["total_sentences"] > 0:
        features["avg_sentence_length"] = features["total_words"] / features["total_sentences"]

    if features["total_paragraphs"] > 0:
        features["avg_paragraph_length"] = features["total_words"] / features["total_paragraphs"]

    # Count contractions. Word-processor text normally uses the
    # typographic apostrophe (U+2019), so fold it to ASCII first;
    # otherwise contractions in such text are never matched. Lower-case
    # once instead of once per pattern.
    # NOTE: this is a substring heuristic; "'s" also matches possessives.
    lowered = text.lower().replace("\u2019", "'")
    contractions = ("n't", "'re", "'ve", "'ll", "'s", "'m", "'d")
    features["contraction_count"] = sum(lowered.count(c) for c in contractions)

    # Count sentences starting with conjunctions. Punctuation and quotes
    # attached to the first token are stripped so that "However," or
    # '"But' are still recognised.
    conjunction_starts = {"but", "and", "so", "still", "yet", "or", "however"}
    edge_punctuation = ".,;:!?\"'()[]{}-\u2018\u2019\u201c\u201d\u2013\u2014"
    for sentence in all_sentences:
        tokens = sentence.split()
        first_word = tokens[0].strip(edge_punctuation).lower() if tokens else ""
        if first_word in conjunction_starts:
            features["sentence_starts_with_conjunction"] += 1

    return features
90
+
91
+
92
def build_style_profile(texts: Dict[str, str]) -> str:
    """Render a human-readable writing-style report for a set of texts.

    The statistics come from ``extract_style_features`` applied to all
    texts joined with newlines. Up to 15 sample sentences are appended:
    from every line longer than 20 characters, at most three fragments
    longer than 15 characters are taken, in document order.

    Args:
        texts: Mapping whose values are the document texts; ``len(texts)``
            is reported as the document count.

    Returns:
        The report as a single newline-joined string.
    """
    stats = extract_style_features("\n".join(texts.values()))

    report = [
        f"Writing Style Profile (analyzed from {len(texts)} document(s)):",
        f"- Average sentence length: {stats['avg_sentence_length']:.1f} words",
        f"- Average paragraph length: {stats['avg_paragraph_length']:.1f} words",
        f"- Total words analyzed: {stats['total_words']}",
        f"- Contractions used: {stats['contraction_count']}",
        f"- Sentences starting with conjunctions: {stats['sentence_starts_with_conjunction']}",
        f"- Total sentences: {stats['total_sentences']}",
        f"- Total paragraphs: {stats['total_paragraphs']}",
    ]

    # Gather candidate sample sentences for style reference.
    samples = []
    for document_text in texts.values():
        for raw_line in document_text.split('\n'):
            line = raw_line.strip()
            if len(line) <= 20:
                continue
            unified = line.replace('!', '.').replace('?', '.')
            fragments = [piece.strip() for piece in unified.split('.')]
            usable = [fragment for fragment in fragments if len(fragment) > 15]
            samples.extend(usable[:3])

    if samples:
        report.append("\nSample sentences for style reference:")
        report.extend(
            f" {number}. {sample}"
            for number, sample in enumerate(samples[:15], 1)
        )

    return "\n".join(report)
123
+
124
+
125
def build_knowledge_base_summary(text: str, max_length: int = 8000) -> str:
    """Create a condensed excerpt of knowledge base content for context injection.

    Text that already fits within ``max_length`` is returned unchanged.
    Otherwise the result is built from the first ~30% and the last ~10%
    of the non-blank paragraphs (at least one paragraph for the head),
    joined by a ``...`` line when paragraphs were left out in between.

    If that excerpt is still too long, the head and the tail are each
    trimmed to a share of the budget, so the end of the document
    survives rather than being cut off by a plain prefix truncation.

    Args:
        text: Full knowledge base text.
        max_length: Maximum number of characters in the result.

    Returns:
        A string no longer than ``max_length`` (for non-negative
        ``max_length``). Empty when ``text`` exceeds the limit but
        contains only whitespace.
    """
    if len(text) <= max_length:
        return text

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    if not paragraphs:
        # Over-long but blank input: there is nothing to summarize.
        return ""

    total = len(paragraphs)
    # Take first 30% and last 10% to capture intro and conclusion; cap
    # the tail so it never overlaps the head (e.g. single paragraph).
    first_count = max(1, int(total * 0.3))
    last_count = min(max(1, int(total * 0.1)), total - first_count)

    head = "\n".join(paragraphs[:first_count])
    tail = "\n".join(paragraphs[total - last_count:])

    pieces = [head]
    if first_count + last_count < total:
        # Only mark an omission when paragraphs were actually skipped.
        pieces.append("...")
    if tail:
        pieces.append(tail)

    result = "\n".join(pieces)
    if len(result) <= max_length:
        return result

    separator = "\n...\n"
    available = max_length - len(separator)
    if available <= 0:
        # Budget too small for head + marker + tail; plain truncation.
        return result[:max(max_length, 0)]

    # Reserve up to a quarter of the budget for the tail and give any
    # room the head does not need back to the tail.
    tail_budget = min(len(tail), available // 4)
    head_budget = available - tail_budget
    if len(head) < head_budget:
        head_budget = len(head)
        tail_budget = min(len(tail), available - head_budget)

    head_part = head[:head_budget].rstrip()
    # Explicit start index: a "-0" slice would return the whole tail.
    tail_part = tail[len(tail) - tail_budget:].lstrip()
    if not tail_part:
        return head_part + "\n..."
    return head_part + separator + tail_part