Spaces:

coingimp
/

ai-writer

Sleeping

App Files Files Community

coingimp committed on Apr 13

Commit

1e906e3

verified ·

1 Parent(s): cbff270

Create docx_parser.py

Browse files

Files changed (1) hide show

utils/docx_parser.py +143 -0

utils/docx_parser.py ADDED Viewed

	@@ -0,0 +1,143 @@

+"""
+DOCX Parser for AI Writer.
+Extracts text from .docx files for dataset and knowledge base processing.
+"""
+import os
+from docx import Document
+from typing import List, Dict, Optional
def parse_docx(file_path: str) -> str:
    """Extract all text from a single .docx file.

    Body paragraphs are collected first, followed by the text of every table
    cell. Blank entries are skipped and the pieces are joined with newlines.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        The extracted text. If the document cannot be opened or parsed, a
        message of the form ``"Error parsing <file_path>: <reason>"`` is
        returned instead of an exception being raised.
    """
    try:
        doc = Document(file_path)
        chunks = []
        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()
            if text:
                chunks.append(text)
        # python-docx reports a merged cell once for every grid position it
        # spans, so remember the underlying XML elements and emit each cell's
        # text only once. Holding the elements (not their ids) keeps them
        # alive, which keeps the identity comparison reliable.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_element = cell._tc
                    if cell_element in seen_cells:
                        continue
                    seen_cells.add(cell_element)
                    text = cell.text.strip()
                    if text:
                        chunks.append(text)
        return "\n".join(chunks)
    except Exception as e:
        # Best-effort contract kept for existing callers: the failure is
        # reported as text rather than propagated.
        return f"Error parsing {file_path}: {str(e)}"
def parse_multiple_docx(file_paths: List[str]) -> Dict[str, str]:
    """Extract text from multiple .docx files.

    Paths whose extension is not ``.docx`` are ignored. The extension check is
    case-insensitive, so ``REPORT.DOCX`` is processed as well.

    Args:
        file_paths: Paths of candidate files.

    Returns:
        A dict mapping each file's base name to the text returned by
        ``parse_docx``. If two paths share a base name, the later one wins.
    """
    results = {}
    for path in file_paths:
        if not path.lower().endswith('.docx'):
            continue
        results[os.path.basename(path)] = parse_docx(path)
    return results
def extract_style_features(text: str) -> Dict:
    """Analyze text to extract simple writing style features.

    Paragraphs are non-blank lines. Sentences are obtained by splitting each
    paragraph on ``.``, ``!`` and ``?``. Words are whitespace-separated tokens
    of the whole text.

    Args:
        text: The text to analyze.

    Returns:
        A dict with the keys ``avg_sentence_length``, ``avg_paragraph_length``,
        ``contraction_count``, ``sentence_starts_with_conjunction``,
        ``total_sentences``, ``total_paragraphs`` and ``total_words``. All
        values are zero when the text is empty or whitespace-only.
    """
    features = {
        "avg_sentence_length": 0,
        "avg_paragraph_length": 0,
        "contraction_count": 0,
        "sentence_starts_with_conjunction": 0,
        "total_sentences": 0,
        "total_paragraphs": 0,
        "total_words": 0,
    }
    if not text.strip():
        return features

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    features["total_paragraphs"] = len(paragraphs)

    all_sentences = []
    for para in paragraphs:
        # Simple sentence splitting: treat '!' and '?' like '.'.
        unified = para.replace('!', '.').replace('?', '.')
        all_sentences.extend(s.strip() for s in unified.split('.') if s.strip())
    features["total_sentences"] = len(all_sentences)

    features["total_words"] = len(text.split())
    if features["total_sentences"] > 0:
        features["avg_sentence_length"] = features["total_words"] / features["total_sentences"]
    if features["total_paragraphs"] > 0:
        features["avg_paragraph_length"] = features["total_words"] / features["total_paragraphs"]

    # Count contractions by suffix. Word processors usually replace the
    # straight apostrophe with U+2019, so normalize it first; otherwise
    # contractions in such text would never be counted. Note that "'s" also
    # matches possessives, so this is an approximation.
    contractions = ["n't", "'re", "'ve", "'ll", "'s", "'m", "'d"]
    lowered = text.lower().replace("\u2019", "'")
    features["contraction_count"] = sum(lowered.count(c) for c in contractions)

    # Count sentences starting with conjunctions. Punctuation attached to the
    # first word (e.g. "However," or an opening quote) is stripped so it does
    # not hide the match.
    conjunction_starts = {"but", "and", "so", "still", "yet", "or", "however"}
    edge_punctuation = "\"'\u201c\u201d\u2018\u2019()[],;:-\u2013\u2014"
    for sentence in all_sentences:
        words = sentence.split()
        first_word = words[0].strip(edge_punctuation).lower() if words else ""
        if first_word in conjunction_starts:
            features["sentence_starts_with_conjunction"] += 1
    return features
def build_style_profile(texts: Dict[str, str]) -> str:
    """Build a human-readable writing style profile from several texts.

    The texts are joined with newlines and passed to
    ``extract_style_features``; the resulting statistics are rendered as a
    bulleted report. Up to 15 sample sentences are appended, taking at most
    three per line, only from lines longer than 20 characters and only
    sentences longer than 15 characters.

    Args:
        texts: Mapping of document name to document text.

    Returns:
        The profile as a single newline-joined string.
    """
    stats = extract_style_features("\n".join(texts.values()))
    report = [
        f"Writing Style Profile (analyzed from {len(texts)} document(s)):",
        f"- Average sentence length: {stats['avg_sentence_length']:.1f} words",
        f"- Average paragraph length: {stats['avg_paragraph_length']:.1f} words",
        f"- Total words analyzed: {stats['total_words']}",
        f"- Contractions used: {stats['contraction_count']}",
        f"- Sentences starting with conjunctions: {stats['sentence_starts_with_conjunction']}",
        f"- Total sentences: {stats['total_sentences']}",
        f"- Total paragraphs: {stats['total_paragraphs']}",
    ]

    # Gather sample sentences for style reference.
    samples = []
    for document in texts.values():
        for raw_line in document.split('\n'):
            line = raw_line.strip()
            if len(line) <= 20:
                continue
            unified = line.replace('!', '.').replace('?', '.')
            pieces = [piece.strip() for piece in unified.split('.')]
            usable = [piece for piece in pieces if len(piece) > 15]
            samples.extend(usable[:3])

    if samples:
        report.append("\nSample sentences for style reference:")
        report.extend(
            f"  {number}. {sample}"
            for number, sample in enumerate(samples[:15], 1)
        )
    return "\n".join(report)
def build_knowledge_base_summary(text: str, max_length: int = 8000) -> str:
    """Create a condensed summary of knowledge base content for context injection.

    Text that already fits within ``max_length`` is returned unchanged.
    Otherwise the first ~30% of the non-blank paragraphs (the introduction)
    and the last ~10% (the conclusion) are kept, separated by a ``...`` line.
    If that selection is still too long, the introduction is shortened and up
    to a quarter of the budget is reserved for the end of the conclusion, so
    the conclusion is not lost to truncation.

    Args:
        text: The full knowledge base text.
        max_length: Maximum number of characters in the result.

    Returns:
        A string of at most ``max_length`` characters. An empty string is
        returned when ``max_length`` is not positive or when an over-long
        text contains nothing but whitespace.
    """
    if max_length <= 0:
        return ""
    if len(text) <= max_length:
        return text

    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    if not paragraphs:
        return ""

    total = len(paragraphs)
    first_count = max(1, int(total * 0.3))
    # Never let the tail reach back into the head; with a single paragraph
    # that would emit the same paragraph twice.
    last_count = min(max(1, int(total * 0.1)), total - first_count)

    head = "\n".join(paragraphs[:first_count])
    if last_count <= 0:
        return head[:max_length]

    tail = "\n".join(paragraphs[-last_count:])
    separator = "\n...\n"
    if len(head) + len(separator) + len(tail) <= max_length:
        return head + separator + tail

    # Over budget: reserve up to a quarter of the space for the end of the
    # conclusion and give the rest to the start of the introduction.
    tail_budget = min(len(tail), max_length // 4)
    head_budget = max_length - len(separator) - tail_budget
    if tail_budget <= 0 or head_budget <= 0:
        return head[:max_length]
    return head[:head_budget] + separator + tail[len(tail) - tail_budget:]