rcai committed on
Commit
748822f
·
verified ·
1 Parent(s): 8ba3f58

Update test.py

Browse files
Files changed (1) hide show
  1. test.py +29 -25
test.py CHANGED
@@ -3,31 +3,35 @@ import re
3
 
4
  import pandas as pd
5
 
6
- # Assuming you have a DataFrame named merge_data
7
- # Example:
8
- # merge_data = pd.read_csv('your_data.csv') # load your data
9
-
10
- # List of columns to compare (adjust based on your actual DataFrame)
11
- columns_to_compare = [
12
- "Biomarker Name Source", "Biomarker Test Type", "Biomarker Test Result Source",
13
- "Protein Alteration (p Notation Source)", "DNA Alteration (c notation Source)",
14
- "Biomarker Variant Type", "Biomarker Test Result", "Protein Alteration (p Notation Source)",
15
- "DNA Alteration (c notation Source)"
16
- ]
17
-
18
- # Calculate differences
19
- differences = {}
20
-
21
- for col in columns_to_compare:
22
- data_col = f"{col}_data"
23
- gt_col = f"{col}_gt"
24
- differences[f"{col}_diff"] = merge_data[data_col] != merge_data[gt_col]
25
-
26
- # Create a DataFrame to show differences
27
- diff_df = pd.DataFrame(differences)
28
-
29
- # Show the resulting differences
30
- diff_df
 
 
 
 
31
 
32
 
33
 
 
3
 
4
  import pandas as pd
5
 
6
+ # Load your data (replace 'your_file.csv' with the actual file path)
7
+ merge_data = pd.read_csv('your_file.csv')
8
+
9
+ # Calculate value counts for columns ending with _data and _gt
10
+ columns_data = [col for col in merge_data.columns if col.endswith('_data')]
11
+ columns_gt = [col for col in merge_data.columns if col.endswith('_gt')]
12
+
13
+ # Initialize a dictionary to store value counts and differences
14
+ value_counts_diff = {}
15
+
16
+ for data_col, gt_col in zip(columns_data, columns_gt):
17
+ data_counts = merge_data[data_col].value_counts(dropna=False)
18
+ gt_counts = merge_data[gt_col].value_counts(dropna=False)
19
+
20
+ # Create a DataFrame combining the counts
21
+ combined_counts = pd.DataFrame({
22
+ 'data_counts': data_counts,
23
+ 'gt_counts': gt_counts
24
+ }).fillna(0)
25
+
26
+ # Calculate the difference between data and gt counts
27
+ combined_counts['difference'] = combined_counts['data_counts'] - combined_counts['gt_counts']
28
+
29
+ # Store in dictionary
30
+ value_counts_diff[data_col] = combined_counts
31
+
32
+ # Display the results for each column
33
+ value_counts_diff
34
+
35
 
36
 
37