fix
text_analyzer/analyzer.py
CHANGED
@@ -1,5 +1,3 @@
-# text_analyzer/analyzer.py (CORRECTED VERSION)
-
 """
 Main library module containing the TextAnalyzer class.
 """
@@ -25,18 +23,15 @@ class TextAnalyzer:
             E.g. ["parser", "ner"] if they are not needed.
         """
         try:
-            # Disable components if the user requests it
             self.nlp = spacy.load(constants.SPACY_MODEL_PL, disable=disable_pipelines or [])
             self.nlp.max_length = constants.NLP_MAX_LENGTH
         except OSError:
-            # ... (error handling unchanged) ...
             print(f"Error: spaCy model '{constants.SPACY_MODEL_PL}' not found.")
             print(f"python -m spacy download {constants.SPACY_MODEL_PL}")
             raise
         textstat.set_lang('pl_PL')

     def _preprocess(self, text: str) -> Tuple:
-        # This method is now more of a helper; the doc is passed in from outside
         text_lower = text.lower()
         words = text.split()
         words_lower = text_lower.split()
@@ -45,7 +40,7 @@ class TextAnalyzer:
         return text_lower, words, words_lower, lines, sentences

     def analyze(self, text: str) -> Dict[str, float]:
-        """Analyzes a single text
+        """Analyzes a single text"""
         doc = self.nlp(text)
         return self._analyze_single_doc(text, doc)

@@ -77,10 +72,9 @@ class TextAnalyzer:
             Iterable[Dict[str, float]]: Generator yielding a feature dictionary for each text.
         """
         # Use nlp.pipe, which is a generator and processes texts in batches
-        # as_tuples=True would let texts carry their original context (not used here)
         docs = self.nlp.pipe(texts, batch_size=batch_size)

         # Process each document from the generator
         for i, doc in enumerate(docs):
-            original_text = texts[i]
+            original_text = texts[i]
             yield self._analyze_single_doc(original_text, doc)
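The net effect here is the closed docstring in analyze and the trimmed batch path. A minimal usage sketch of the corrected API; the batch method's name sits outside the hunks, so analyze_batch below is a placeholder, and the spaCy model behind constants.SPACY_MODEL_PL is assumed to be installed:

    from text_analyzer.analyzer import TextAnalyzer

    # Optional pipeline components can be disabled at load time,
    # as the constructor docstring suggests:
    analyzer = TextAnalyzer(disable_pipelines=["parser", "ner"])

    # Single-text analysis returns a flat feature dictionary:
    features = analyzer.analyze("Przykładowy tekst do analizy.")
    print(features)

    # The batch method (placeholder name) streams results lazily via
    # nlp.pipe, so a large corpus is never fully materialized:
    for feats in analyzer.analyze_batch(["Pierwszy tekst.", "Drugi tekst."], batch_size=2):
        print(feats)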
text_analyzer/features/base_features.py
CHANGED
@@ -84,20 +84,17 @@ def analyze_advanced_char_features(text: str) -> Dict[str, float]:
     word_freq = Counter(words_found)
     most_common = word_freq.most_common(10)

-    # Polish diacritics
     polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
     char_counts = Counter(text)
     diac_count = sum(char_counts.get(ch, 0) for ch in polish_diacritics)
     letters_count = sum(1 for ch in text if ch.isalpha())

-    # Single char words
     single_chars = [w for w in words_found if len(w) == 1]
     single_char_freq = Counter(single_chars)
     top_3_single = single_char_freq.most_common(3)
     top_codes = [ord(w) for w, _ in top_3_single]
     while len(top_codes) < 3: top_codes.append(0)

-    # Encoding
     replacement_count = char_counts.get('\uFFFD', 0)
     not_allowed_count = sum(1 for ch in text if not ALLOWED_CHARS_PATTERN.match(ch))
     replacement_ratio = safe_divide(replacement_count, total_chars)
@@ -143,7 +140,7 @@ def analyze_word_stats(words: List[str], words_lower: List[str]) -> Dict[str, float]:
     if not total_words: return {'mean_word_length': 0.0, 'lexical_diversity': 0.0, 'count_caps': 0.0, 'word_isupper<5': 0, 'word_isupper>5': 0, 'count_digit_to_caps': 0.0}

     digit_count = sum(1 for w in words if any(ch.isdigit() for ch in w))
-    caps_count = sum(1 for w in words if w.isupper())
+    caps_count = sum(1 for w in words if w.isupper())

     return {
         'mean_word_length': safe_divide(sum(len(w) for w in words_lower), total_words),
@@ -222,7 +219,7 @@ def analyze_line_content(lines: List[str]) -> Dict[str, float]:
     }

 def count_lorem_ipsum(text_lower: str) -> Dict[str, float]:
-    """Calculates the ratio
+    """Calculates the lorem ipsum ratio"""
     count = text_lower.count('lorem ipsum')
     return {'lorem_ipsum_ratio': safe_divide(count, len(text_lower))}

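For reference, the diacritic-ratio logic touched above can be exercised standalone. A minimal sketch; safe_divide is reimplemented here as an assumption, since the package's own helper is not part of this diff:

    from collections import Counter

    def safe_divide(a: float, b: float) -> float:
        # Assumed behavior: return 0.0 on a zero denominator.
        return a / b if b else 0.0

    text = "Zażółć gęślą jaźń"  # Polish pangram for diacritics
    polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
    char_counts = Counter(text)
    diac_count = sum(char_counts.get(ch, 0) for ch in polish_diacritics)
    letters_count = sum(1 for ch in text if ch.isalpha())
    print(safe_divide(diac_count, letters_count))  # 9/15 = 0.6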
text_analyzer/features/linguistic_features.py
CHANGED
@@ -1,4 +1,3 @@
-# text_analyzer/features/linguistic_features.py
 """
 Module for extracting linguistic and stylistic text features.
 """
@@ -45,7 +44,6 @@ def calculate_symbol_to_word_ratio(words: List[str], text: str) -> Dict[str, float]:
     symbol_count = char_counts.get('#', 0) + triple_dot_count + char_counts.get('…', 0)
     return {'symbol_to_word_ratio': safe_divide(symbol_count, total_words)}

-
 # --- Functions analyzing n-grams ---

 def calculate_ngram_fractions(words: List[str]) -> Dict[str, float]:
@@ -77,7 +75,6 @@ def calculate_ngram_fractions(words: List[str]) -> Dict[str, float]:
 # --- Functions analyzing text style ---

 def analyze_stylistic_metrics(text: str, words: List[str], sentences: List[str]) -> Dict[str, float]:
-    # Use simple tokenization to stay compatible with the old code
     sentences_from_regex = re.findall(r'[^.!?]+[.!?]', text)
     num_sentences = len(sentences_from_regex)
     words_per_sentence = [len(s.split()) for s in sentences_from_regex]
@@ -114,9 +111,9 @@ def calculate_all_linguistic_features(text: str, text_lower: str, words: List[str],
     features.update(calculate_stop_word_ratio(words_lower))
     features.update(count_bad_words(words_lower))
     features.update(calculate_unigram_entropy(words_lower))
-    features.update(count_non_alpha_words(text))
+    features.update(count_non_alpha_words(text))
     features.update(calculate_symbol_to_word_ratio(words, text))
     features.update(calculate_ngram_fractions(words))
     features.update(analyze_stylistic_metrics(text, words, sentences))
-    features['javascript_counts_per_line'] = text_lower.count('javascript')
+    features['javascript_counts_per_line'] = text_lower.count('javascript')
     return features
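The sentence split kept in analyze_stylistic_metrics is deliberately simple; a self-contained sketch of its behavior, including the caveat that a trailing fragment without closing punctuation is dropped:

    import re

    text = "Pierwsze zdanie. Drugie zdanie! A to urwany fragment"
    sentences = re.findall(r'[^.!?]+[.!?]', text)
    print(sentences)  # ['Pierwsze zdanie.', ' Drugie zdanie!'] -- fragment dropped
    print([len(s.split()) for s in sentences])  # words per sentence: [2, 2]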
text_analyzer/features/regex_features.py
CHANGED
@@ -1,4 +1,3 @@
-# text_analyzer/features/regex_features.py
 """
 Module for extracting features based on regular expressions.

@@ -37,7 +36,6 @@ def calculate_all_regex_features(text: str) -> Dict[str, int]:
             matches = pattern.findall(text)
             features[name] = len(matches)
         except Exception as e:
-            # Add error handling in case a regex misbehaves
             print(f"Error while processing pattern '{name}': {e}")
             features[name] = 0

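The defensive loop kept above applies each named pattern independently, so one failing regex yields a count of 0 instead of aborting the whole extraction. A hedged standalone sketch of that pattern; the PATTERNS dict below is an illustrative stand-in, not the library's own pattern table:

    import re
    from typing import Dict

    # Stand-in patterns for illustration only.
    PATTERNS = {
        'emails': re.compile(r'\b[\w.+-]+@[\w-]+\.[\w.]+\b'),
        'urls': re.compile(r'https?://\S+'),
    }

    def count_matches(text: str) -> Dict[str, int]:
        features: Dict[str, int] = {}
        for name, pattern in PATTERNS.items():
            try:
                features[name] = len(pattern.findall(text))
            except Exception as e:
                # A broken pattern degrades to 0 rather than raising.
                print(f"Error while processing pattern '{name}': {e}")
                features[name] = 0
        return features

    print(count_matches("Napisz na jan@example.com albo wejdź na https://example.pl"))
    # {'emails': 1, 'urls': 1}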
text_analyzer/features/structural_features.py
CHANGED
@@ -1,4 +1,3 @@
-# text_analyzer/features/structural_features.py
 """
 Module for extracting structural and text-formatting features.
 """