VinayNR
/

stats-nerd

Token Classification

Model card Files Files and versions

VinayNR committed on Apr 20, 2023

Commit

f5c1d3a

·

1 Parent(s): 8a8dcbd

Upload hypothesis.py

Files changed (1) hide show

hypothesis.py +98 -0

hypothesis.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import scipy.stats as stats
def isfloat(text):
    """Return True if *text* can be converted to a float, otherwise False.

    Anything ``float()`` accepts counts as a float here (for example
    ``"1.5"``, ``"-2"``, ``"1e3"`` or ``"nan"``).

    Args:
        text: The value to probe, normally a string token.

    Returns:
        bool: True when ``float(text)`` succeeds, False when it raises
        ``ValueError`` (unparseable string) or ``TypeError`` (unsupported
        type such as ``None``).
    """
    try:
        # Only the success/failure of the conversion matters; the value
        # itself is discarded.
        float(text)
    except (TypeError, ValueError):
        return False
    return True
def extract_p_val(text):
    """Split a reported p-value span into its value and comparison sign.

    The span is read from the right: the last whitespace-separated token is
    taken as the value and the token before it as the sign, e.g.
    ``"p < 0.05"`` gives ``("0.05", "<")``.

    Args:
        text: The p-value span as a whitespace-separated string.

    Returns:
        tuple: ``(value, sign)``, both as strings.

    Raises:
        IndexError: If the span contains fewer than two tokens.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so stray spaces never produce empty tokens.
    parts = text.split()
    return parts[-1], parts[-2]
class HypothesisTest:
    """A t- or z-test parsed from reported text such as "t-test ( 37 ) = 1.414".

    Attributes:
        test_type: ``'t'`` or ``'z'``, taken from the first token.
        test_stat: The test statistic as a float (0.0 when none was found).
        df1: Degrees of freedom as an int (0 when none was found).
        tails: Multiplier applied to the one-sided tail probability in
            ``calculate_p_val`` (defaults to 1).
    """

    # Comparison signs accepted by the ``reported_p_val_dir`` setter.
    _VALID_DIRECTIONS = ('<', '>', '=')

    def __init__(self, text):
        """Parse *text* into test type, degrees of freedom and statistic.

        Tokens are whitespace-separated. The first token selects the test
        type by its first letter. Of the remaining numeric tokens, a
        digits-only token is read as the degrees of freedom, except when it
        directly supplies the statistic: i.e. it follows an ``=`` token and
        no statistic has been found yet (``"t ( 37 ) = 2"``, ``"z = 2"``).
        Every other numeric token is read as the test statistic.

        Args:
            text: The reported test, e.g. ``"t-test ( 37 ) = 1.414"``.

        Raises:
            ValueError: If the first token does not start with 't' or 'z'
                (case-insensitive), or *text* has no tokens at all.
        """
        self.test_type = ''
        self.test_stat = 0.0
        self.df1 = 0
        self.tails = 1
        # Initialised up front so the properties and __str__ never raise
        # AttributeError when no p-value has been attached (yet).
        self._reported_p_val = None
        self._reported_p_val_dir = None

        parts = text.split()
        test_name = parts[0].lower() if parts else ''
        if test_name.startswith('t'):
            self.test_type = 't'
        elif test_name.startswith('z'):
            self.test_type = 'z'
        else:
            raise ValueError('Failed to parse the test')

        seen_equals = False
        stat_found = False
        for part in parts:
            if part == '=':
                seen_equals = True
                continue
            try:
                value = float(part)
            except ValueError:
                # Not a number: brackets, the test name, commas, ...
                continue
            # An integer right after '=' is the statistic itself unless a
            # statistic was already read (as in "t = 2.5 , df = 37").
            is_df = part.isdigit() and (stat_found or not seen_equals)
            if is_df:
                self.df1 = int(part)
            else:
                self.test_stat = value
                stat_found = True

    @property
    def reported_p_val(self):
        """The p-value as reported in the text, or None if not set."""
        return self._reported_p_val

    @reported_p_val.setter
    def reported_p_val(self, value):
        # Stored as given; no validation is applied.
        self._reported_p_val = value

    @property
    def reported_p_val_dir(self):
        """The comparison sign of the reported p-value, or None if not set."""
        return self._reported_p_val_dir

    @reported_p_val_dir.setter
    def reported_p_val_dir(self, dirn):
        # Invalid signs are reported on stdout and otherwise ignored, so the
        # previously stored direction (or None) is kept.
        if dirn in self._VALID_DIRECTIONS:
            self._reported_p_val_dir = dirn
        else:
            print("The direction can be one of <, >, =")

    def calculate_p_val(self):
        """Recompute the p-value from the parsed statistic.

        Returns:
            ``tails`` times the upper-tail probability of ``|test_stat|``
            under the t distribution with ``df1`` degrees of freedom (for
            't') or the standard normal (for 'z'); None for any other
            ``test_type``.
        """
        statistic = abs(float(self.test_stat))
        # sf(x) equals 1 - cdf(x) but keeps precision for very small tails.
        if self.test_type == 't':
            return self.tails * stats.t.sf(statistic, df=int(self.df1))
        if self.test_type == 'z':
            return self.tails * stats.norm.sf(statistic)
        return None

    @staticmethod
    def get_reported_stat_tests(sentence):
        """Collect the tests labelled in *sentence* with their reported p-values.

        Args:
            sentence: An object whose ``get_labels('ner')`` returns labelled
                entities exposing ``value`` and ``data_point.text``.

        Returns:
            list: One ``HypothesisTest`` per entity labelled ``'T'`` that
            could be parsed and paired with a following entity; entities
            that fail either step are skipped.
        """
        tests = []
        labeled_entities = sentence.get_labels('ner')
        for idx, entity in enumerate(labeled_entities):
            if entity.value != 'T':
                continue
            try:
                test = HypothesisTest(entity.data_point.text)
                # The p-value is assumed to be the very next labelled entity.
                # NOTE(review): its label is not checked - confirm that the
                # entity after a 'T' span is always the p-value span.
                p_val_span = labeled_entities[idx + 1].data_point.text
                p_val, dirn = extract_p_val(p_val_span)
            except Exception:
                # Best effort: an unparseable test, or one without a
                # following span, is simply not reported.
                continue
            test.reported_p_val = p_val
            test.reported_p_val_dir = dirn
            tests.append(test)
        return tests

    def __str__(self):
        """Return a one-line summary; unset p-value parts render as empty."""
        dirn = '' if self._reported_p_val_dir is None else str(self._reported_p_val_dir)
        p_val = '' if self._reported_p_val is None else str(self._reported_p_val)
        return (
            "Test Type : " + self.test_type
            + " | " + "Test Stat : " + str(self.test_stat)
            + " | " + "DF : " + str(self.df1)
            + " | " + "Rep P-val : " + dirn + p_val
        )