bstraehle committed on
Commit
812fce1
·
verified ·
1 Parent(s): 47f7668

Create utils/utils.py

Browse files
Files changed (1) hide show
  1. utils/utils.py +70 -0
utils/utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from docx import Document
4
+ from pptx import Presentation
5
+
6
def get_questions(file_path, level):
    """Load question records from a JSON Lines file.

    Args:
        file_path: Path to a JSON Lines file whose records provide the
            "Question", "Level", "Final answer" and "file_name" fields.
        level: Level to keep. A value that is not greater than 0 disables
            the filter, so every record is returned.

    Returns:
        A list with one ``[question, level, final_answer, file_name]``
        list per retained record, in file order.
    """
    frame = pd.read_json(file_path, lines=True)

    # Only positive levels act as a filter.
    if level > 0:
        frame = frame[frame["Level"] == level]

    return [
        [record["Question"], record["Level"], record["Final answer"], record["file_name"]]
        for _, record in frame.iterrows()
    ]
18
+
19
def is_ext(file_path, ext):
    """Report whether ``file_path`` ends with the extension ``ext``.

    The comparison ignores case. ``ext`` is compared against the full
    extension returned by ``os.path.splitext``, so it must include the
    leading dot (for example ``".csv"``).
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() == ext.lower()
21
+
22
def read_file_json(file_path):
    """Read a tabular data file and return its contents as a JSON string.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv`` uses ``pd.read_csv``, ``.xls``/``.xlsx`` use ``pd.read_excel``
    and ``.json``/``.jsonl`` use ``pd.read_json``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The loaded frame serialized with ``DataFrame.to_json()``, or an
        empty string when the extension is not one of the supported ones.
    """
    # Resolve the extension once instead of re-splitting the path per branch.
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".csv":
        df = pd.read_csv(file_path)
    elif suffix in (".xls", ".xlsx"):
        df = pd.read_excel(file_path)
    elif suffix in (".json", ".jsonl"):
        # JSON Lines stores one document per line; without lines=True pandas
        # rejects any file holding more than one record ("Trailing data").
        df = pd.read_json(file_path, lines=suffix == ".jsonl")
    else:
        return ""

    return df.to_json()
33
+
34
def read_docx_text(file_path):
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    The direct children of the document body are visited in document order:

    * a paragraph whose style name starts with "Heading" is emitted as
      ``"\\n**<text>**\\n"``;
    * any other paragraph is emitted as-is, unless its text is empty;
    * each table row is emitted as its stripped cell texts joined by " | ".

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        All emitted pieces joined with newlines.
    """
    doc = Document(file_path)

    # Index the paragraph/table wrappers by their underlying XML element once,
    # so each body element is resolved with a single lookup instead of
    # rescanning doc.paragraphs / doc.tables for every element.
    paragraphs = {paragraph._element: paragraph for paragraph in doc.paragraphs}
    tables = {table._element: table for table in doc.tables}

    text = []

    for block in doc.element.body:
        paragraph = paragraphs.get(block)
        if paragraph is not None:
            if paragraph.style.name.startswith("Heading"):
                text.append("\n**" + paragraph.text + "**\n")
            elif paragraph.text:
                text.append(paragraph.text)
            continue

        table = tables.get(block)
        if table is not None:
            for row in table.rows:
                text.append(" | ".join(cell.text.strip() for cell in row.cells))

    return "\n".join(text)
57
+
58
def read_pptx_text(file_path):
    """Extract the text of a .pptx file, slide by slide.

    For every slide, the ``text`` of each shape that exposes such an
    attribute is collected and joined with newlines.

    Args:
        file_path: Path of the .pptx file to read.

    Returns:
        The per-slide texts joined with blank lines ("\\n\\n").
    """
    deck = Presentation(file_path)

    per_slide = [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in deck.slides
    ]

    return "\n\n".join(per_slide)