Ravi Raj committed on
Commit
9ca244a
·
1 Parent(s): 71323e2

Add AI-generated attachment chunks and rebuild embeddings

Browse files
data/add_fake_attachments.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import json
3
+ import sys
4
+
5
+ # --- ensure project root is on sys.path so `email_rag` is importable ---
6
+ ROOT_DIR = Path(__file__).resolve().parents[1]
7
+ if str(ROOT_DIR) not in sys.path:
8
+ sys.path.insert(0, str(ROOT_DIR))
9
+
10
+ from email_rag.rag_config import DATA_DIR, CHUNKS_PATH, CHUNK_IDS_PATH
11
+
12
+ FAKE_ATTACHMENTS_PATH = DATA_DIR / "fake_attachments.jsonl"
13
+
14
+
15
def load_jsonl(path: Path):
    """Read a JSON Lines file and return its records as a list.

    Whitespace-only lines are skipped; every other line is parsed with
    ``json.loads``.

    Args:
        path: Location of the ``.jsonl`` file. It is opened as UTF-8 text.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type, but say *where* the bad
                # record is; the bare error only reports a column within
                # the single line that was parsed.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return rows
24
+
25
+
26
def save_jsonl(path: Path, rows):
    """Write ``rows`` to ``path`` as JSON Lines, replacing any existing file.

    Each row is serialised with ``json.dumps`` and followed by a newline.
    The data is first written to a sibling ``<name>.tmp`` file and then
    renamed over ``path``, so a failure while serialising or writing leaves
    the previous contents of ``path`` untouched instead of truncated.

    Args:
        path: Destination file.
        rows: Iterable of JSON-serialisable values, one per output line.

    Raises:
        TypeError: If a row cannot be serialised by ``json.dumps``.
        OSError: If the temporary file cannot be written or renamed.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            for r in rows:
                f.write(json.dumps(r) + "\n")
        # Only swap the new file in once every row has been written.
        tmp_path.replace(path)
    finally:
        # After a successful rename the temp file is gone; on failure, make
        # sure no partial file is left behind.
        if tmp_path.exists():
            tmp_path.unlink()
30
+
31
+
32
def main():
    """Append attachment chunks to the chunk store and refresh the ID list.

    Steps:
      1. Load the existing chunks from ``CHUNKS_PATH``.
      2. Load attachment records from ``FAKE_ATTACHMENTS_PATH``.
      3. Build one chunk record per attachment, skipping any whose
         ``chunk_id`` is already present (in the store or earlier in the
         attachments file).
      4. If anything new was produced, rewrite ``CHUNKS_PATH`` with the
         combined chunks and rewrite ``CHUNK_IDS_PATH`` with every chunk ID
         in the same order.

    Each attachment record must provide ``thread_id``, ``message_id`` and
    ``text``; ``page_no`` (default 1), ``filename``, ``source`` (default
    ``"attachment"``) and ``chunk_id`` are optional.

    Raises:
        KeyError: If a chunk lacks ``chunk_id`` or an attachment lacks one
            of its required keys.
    """
    print(f"Loading chunks from {CHUNKS_PATH} ...")
    # Reuse the shared loader rather than duplicating its read loop here.
    chunks = load_jsonl(CHUNKS_PATH)

    # Grows inside the loop as well, so duplicates within the attachments
    # file itself are also skipped.
    existing_ids = {c["chunk_id"] for c in chunks}

    print(f"Loading fake attachments from {FAKE_ATTACHMENTS_PATH} ...")
    attachments = load_jsonl(FAKE_ATTACHMENTS_PATH)

    new_chunks = []
    for att in attachments:
        thread_id = att["thread_id"]
        message_id = att["message_id"]
        page_no = att.get("page_no", 1)
        filename = att.get("filename")

        # unique id for each attachment page
        chunk_id = att.get("chunk_id") or f"att_{message_id}_p{page_no}"
        if chunk_id in existing_ids:
            print(f"Skipping duplicate attachment chunk_id={chunk_id}")
            continue

        rec = {
            "chunk_id": chunk_id,
            "thread_id": thread_id,
            "message_id": message_id,
            "page_no": page_no,
            "source": att.get("source", "attachment"),
            "text": att["text"],
        }
        # Only store a filename when the attachment actually has one.
        if filename:
            rec["filename"] = filename

        new_chunks.append(rec)
        existing_ids.add(chunk_id)

    if not new_chunks:
        print("No new attachment chunks to add.")
        return

    print(f"Adding {len(new_chunks)} attachment chunks …")
    chunks.extend(new_chunks)

    save_jsonl(CHUNKS_PATH, chunks)
    print(f"Saved updated chunks to {CHUNKS_PATH}")

    # regenerate chunk_ids.json
    all_ids = [c["chunk_id"] for c in chunks]
    with CHUNK_IDS_PATH.open("w", encoding="utf-8") as f:
        json.dump(all_ids, f)
    print(f"Updated chunk_ids.json at {CHUNK_IDS_PATH}")

    # The ID list now has more entries than any embeddings saved before this
    # run, so tell the operator to rebuild them.
    print(
        "NOTE: re-run data/build_embeddings.py so the saved embeddings "
        "include the new chunks."
    )
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()
data/build_embeddings.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # data/build_embeddings.py
2
+
3
+ import json
4
+ from pathlib import Path
5
+ import sys
6
+
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ # --- ensure project root is on sys.path so `email_rag` is importable ---
11
+ ROOT_DIR = Path(__file__).resolve().parents[1] # parent of `data/`
12
+ if str(ROOT_DIR) not in sys.path:
13
+ sys.path.insert(0, str(ROOT_DIR))
14
+
15
+ from email_rag.rag_config import CHUNKS_PATH, CHUNK_IDS_PATH, EMBEDDINGS_PATH
16
+
17
+
18
def load_jsonl(path: Path):
    """Read a JSON Lines file and return its records as a list.

    Whitespace-only lines are skipped; every other line is parsed with
    ``json.loads``.

    Args:
        path: Location of the ``.jsonl`` file. It is opened as UTF-8 text.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    rows = []
    with path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Keep the exception type but report which record is broken;
                # the bare error only gives a column within one line.
                raise json.JSONDecodeError(
                    f"{path}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return rows
27
+
28
+
29
def main():
    """Embed every chunk's text and save the vectors alongside their IDs.

    Reads all chunks from ``CHUNKS_PATH``, encodes each chunk's ``text``
    with the ``sentence-transformers/all-MiniLM-L6-v2`` model (requesting
    normalised embeddings), casts the result to ``float32`` and saves it to
    ``EMBEDDINGS_PATH`` with ``np.save``. The chunk IDs are written to
    ``CHUNK_IDS_PATH`` as a JSON list in the same order as the texts that
    were encoded.
    """
    print(f"Loading chunks from {CHUNKS_PATH} ...")
    records = load_jsonl(CHUNKS_PATH)

    # Collect texts and IDs in one pass so position i in both lists refers
    # to the same chunk.
    texts = []
    chunk_ids = []
    for record in records:
        texts.append(record["text"])
        chunk_ids.append(record["chunk_id"])

    print(f"Total chunks: {len(records)}")

    print("Loading sentence-transformers model ...")
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    print("Encoding chunks ...")
    encode_options = {
        "normalize_embeddings": True,
        "show_progress_bar": True,
    }
    vectors = encoder.encode(texts, **encode_options)
    vectors = vectors.astype("float32")

    print(f"Saving embeddings to {EMBEDDINGS_PATH} ...")
    np.save(EMBEDDINGS_PATH, vectors)

    print(f"Saving chunk IDs to {CHUNK_IDS_PATH} ...")
    serialized_ids = json.dumps(chunk_ids)
    with CHUNK_IDS_PATH.open("w", encoding="utf-8") as ids_file:
        ids_file.write(serialized_ids)

    print("Done.")
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
data/chunk_ids.json CHANGED
@@ -1,209 +1 @@
1
- [
2
- "chunk_M-000001",
3
- "chunk_M-000002",
4
- "chunk_M-000003",
5
- "chunk_M-000004",
6
- "chunk_M-000005",
7
- "chunk_M-000006",
8
- "chunk_M-000007",
9
- "chunk_M-000008",
10
- "chunk_M-000009",
11
- "chunk_M-000010",
12
- "chunk_M-000011",
13
- "chunk_M-000012",
14
- "chunk_M-000013",
15
- "chunk_M-000014",
16
- "chunk_M-000015",
17
- "chunk_M-000016",
18
- "chunk_M-000017",
19
- "chunk_M-000018",
20
- "chunk_M-000019",
21
- "chunk_M-000020",
22
- "chunk_M-000021",
23
- "chunk_M-000022",
24
- "chunk_M-000023",
25
- "chunk_M-000024",
26
- "chunk_M-000025",
27
- "chunk_M-000026",
28
- "chunk_M-000055",
29
- "chunk_M-000056",
30
- "chunk_M-000059",
31
- "chunk_M-000060",
32
- "chunk_M-000071",
33
- "chunk_M-000072",
34
- "chunk_M-000073",
35
- "chunk_M-000027",
36
- "chunk_M-000028",
37
- "chunk_M-000029",
38
- "chunk_M-000030",
39
- "chunk_M-000031",
40
- "chunk_M-000032",
41
- "chunk_M-000085",
42
- "chunk_M-000086",
43
- "chunk_M-000087",
44
- "chunk_M-000088",
45
- "chunk_M-000089",
46
- "chunk_M-000090",
47
- "chunk_M-000102",
48
- "chunk_M-000103",
49
- "chunk_M-000104",
50
- "chunk_M-000105",
51
- "chunk_M-000106",
52
- "chunk_M-000107",
53
- "chunk_M-000108",
54
- "chunk_M-000109",
55
- "chunk_M-000110",
56
- "chunk_M-000111",
57
- "chunk_M-000112",
58
- "chunk_M-000113",
59
- "chunk_M-000033",
60
- "chunk_M-000034",
61
- "chunk_M-000035",
62
- "chunk_M-000036",
63
- "chunk_M-000037",
64
- "chunk_M-000038",
65
- "chunk_M-000039",
66
- "chunk_M-000040",
67
- "chunk_M-000041",
68
- "chunk_M-000042",
69
- "chunk_M-000043",
70
- "chunk_M-000044",
71
- "chunk_M-000045",
72
- "chunk_M-000046",
73
- "chunk_M-000047",
74
- "chunk_M-000048",
75
- "chunk_M-000049",
76
- "chunk_M-000050",
77
- "chunk_M-000051",
78
- "chunk_M-000052",
79
- "chunk_M-000074",
80
- "chunk_M-000075",
81
- "chunk_M-000076",
82
- "chunk_M-000077",
83
- "chunk_M-000053",
84
- "chunk_M-000054",
85
- "chunk_M-000057",
86
- "chunk_M-000058",
87
- "chunk_M-000061",
88
- "chunk_M-000062",
89
- "chunk_M-000126",
90
- "chunk_M-000127",
91
- "chunk_M-000128",
92
- "chunk_M-000129",
93
- "chunk_M-000130",
94
- "chunk_M-000131",
95
- "chunk_M-000171",
96
- "chunk_M-000172",
97
- "chunk_M-000063",
98
- "chunk_M-000064",
99
- "chunk_M-000069",
100
- "chunk_M-000070",
101
- "chunk_M-000114",
102
- "chunk_M-000115",
103
- "chunk_M-000136",
104
- "chunk_M-000137",
105
- "chunk_M-000142",
106
- "chunk_M-000143",
107
- "chunk_M-000144",
108
- "chunk_M-000145",
109
- "chunk_M-000065",
110
- "chunk_M-000066",
111
- "chunk_M-000067",
112
- "chunk_M-000068",
113
- "chunk_M-000081",
114
- "chunk_M-000082",
115
- "chunk_M-000083",
116
- "chunk_M-000084",
117
- "chunk_M-000132",
118
- "chunk_M-000133",
119
- "chunk_M-000134",
120
- "chunk_M-000135",
121
- "chunk_M-000138",
122
- "chunk_M-000139",
123
- "chunk_M-000140",
124
- "chunk_M-000141",
125
- "chunk_M-000078",
126
- "chunk_M-000079",
127
- "chunk_M-000080",
128
- "chunk_M-000116",
129
- "chunk_M-000117",
130
- "chunk_M-000118",
131
- "chunk_M-000150",
132
- "chunk_M-000151",
133
- "chunk_M-000152",
134
- "chunk_M-000153",
135
- "chunk_M-000154",
136
- "chunk_M-000155",
137
- "chunk_M-000156",
138
- "chunk_M-000157",
139
- "chunk_M-000158",
140
- "chunk_M-000159",
141
- "chunk_M-000091",
142
- "chunk_M-000092",
143
- "chunk_M-000093",
144
- "chunk_M-000094",
145
- "chunk_M-000095",
146
- "chunk_M-000096",
147
- "chunk_M-000097",
148
- "chunk_M-000098",
149
- "chunk_M-000099",
150
- "chunk_M-000100",
151
- "chunk_M-000101",
152
- "chunk_M-000119",
153
- "chunk_M-000120",
154
- "chunk_M-000121",
155
- "chunk_M-000122",
156
- "chunk_M-000123",
157
- "chunk_M-000124",
158
- "chunk_M-000125",
159
- "chunk_M-000146",
160
- "chunk_M-000147",
161
- "chunk_M-000148",
162
- "chunk_M-000149",
163
- "chunk_M-000206",
164
- "chunk_M-000207",
165
- "chunk_M-000160",
166
- "chunk_M-000161",
167
- "chunk_M-000162",
168
- "chunk_M-000163",
169
- "chunk_M-000164",
170
- "chunk_M-000165",
171
- "chunk_M-000166",
172
- "chunk_M-000167",
173
- "chunk_M-000168",
174
- "chunk_M-000169",
175
- "chunk_M-000170",
176
- "chunk_M-000173",
177
- "chunk_M-000174",
178
- "chunk_M-000175",
179
- "chunk_M-000176",
180
- "chunk_M-000177",
181
- "chunk_M-000178",
182
- "chunk_M-000179",
183
- "chunk_M-000180",
184
- "chunk_M-000181",
185
- "chunk_M-000182",
186
- "chunk_M-000183",
187
- "chunk_M-000184",
188
- "chunk_M-000185",
189
- "chunk_M-000186",
190
- "chunk_M-000187",
191
- "chunk_M-000188",
192
- "chunk_M-000189",
193
- "chunk_M-000190",
194
- "chunk_M-000191",
195
- "chunk_M-000192",
196
- "chunk_M-000193",
197
- "chunk_M-000194",
198
- "chunk_M-000195",
199
- "chunk_M-000196",
200
- "chunk_M-000197",
201
- "chunk_M-000198",
202
- "chunk_M-000199",
203
- "chunk_M-000200",
204
- "chunk_M-000201",
205
- "chunk_M-000202",
206
- "chunk_M-000203",
207
- "chunk_M-000204",
208
- "chunk_M-000205"
209
- ]
 
1
+ ["chunk_M-000001", "chunk_M-000002", "chunk_M-000003", "chunk_M-000004", "chunk_M-000005", "chunk_M-000006", "chunk_M-000007", "chunk_M-000008", "chunk_M-000009", "chunk_M-000010", "chunk_M-000011", "chunk_M-000012", "chunk_M-000013", "chunk_M-000014", "chunk_M-000015", "chunk_M-000016", "chunk_M-000017", "chunk_M-000018", "chunk_M-000019", "chunk_M-000020", "chunk_M-000021", "chunk_M-000022", "chunk_M-000023", "chunk_M-000024", "chunk_M-000025", "chunk_M-000026", "chunk_M-000055", "chunk_M-000056", "chunk_M-000059", "chunk_M-000060", "chunk_M-000071", "chunk_M-000072", "chunk_M-000073", "chunk_M-000027", "chunk_M-000028", "chunk_M-000029", "chunk_M-000030", "chunk_M-000031", "chunk_M-000032", "chunk_M-000085", "chunk_M-000086", "chunk_M-000087", "chunk_M-000088", "chunk_M-000089", "chunk_M-000090", "chunk_M-000102", "chunk_M-000103", "chunk_M-000104", "chunk_M-000105", "chunk_M-000106", "chunk_M-000107", "chunk_M-000108", "chunk_M-000109", "chunk_M-000110", "chunk_M-000111", "chunk_M-000112", "chunk_M-000113", "chunk_M-000033", "chunk_M-000034", "chunk_M-000035", "chunk_M-000036", "chunk_M-000037", "chunk_M-000038", "chunk_M-000039", "chunk_M-000040", "chunk_M-000041", "chunk_M-000042", "chunk_M-000043", "chunk_M-000044", "chunk_M-000045", "chunk_M-000046", "chunk_M-000047", "chunk_M-000048", "chunk_M-000049", "chunk_M-000050", "chunk_M-000051", "chunk_M-000052", "chunk_M-000074", "chunk_M-000075", "chunk_M-000076", "chunk_M-000077", "chunk_M-000053", "chunk_M-000054", "chunk_M-000057", "chunk_M-000058", "chunk_M-000061", "chunk_M-000062", "chunk_M-000126", "chunk_M-000127", "chunk_M-000128", "chunk_M-000129", "chunk_M-000130", "chunk_M-000131", "chunk_M-000171", "chunk_M-000172", "chunk_M-000063", "chunk_M-000064", "chunk_M-000069", "chunk_M-000070", "chunk_M-000114", "chunk_M-000115", "chunk_M-000136", "chunk_M-000137", "chunk_M-000142", "chunk_M-000143", "chunk_M-000144", "chunk_M-000145", "chunk_M-000065", "chunk_M-000066", "chunk_M-000067", 
"chunk_M-000068", "chunk_M-000081", "chunk_M-000082", "chunk_M-000083", "chunk_M-000084", "chunk_M-000132", "chunk_M-000133", "chunk_M-000134", "chunk_M-000135", "chunk_M-000138", "chunk_M-000139", "chunk_M-000140", "chunk_M-000141", "chunk_M-000078", "chunk_M-000079", "chunk_M-000080", "chunk_M-000116", "chunk_M-000117", "chunk_M-000118", "chunk_M-000150", "chunk_M-000151", "chunk_M-000152", "chunk_M-000153", "chunk_M-000154", "chunk_M-000155", "chunk_M-000156", "chunk_M-000157", "chunk_M-000158", "chunk_M-000159", "chunk_M-000091", "chunk_M-000092", "chunk_M-000093", "chunk_M-000094", "chunk_M-000095", "chunk_M-000096", "chunk_M-000097", "chunk_M-000098", "chunk_M-000099", "chunk_M-000100", "chunk_M-000101", "chunk_M-000119", "chunk_M-000120", "chunk_M-000121", "chunk_M-000122", "chunk_M-000123", "chunk_M-000124", "chunk_M-000125", "chunk_M-000146", "chunk_M-000147", "chunk_M-000148", "chunk_M-000149", "chunk_M-000206", "chunk_M-000207", "chunk_M-000160", "chunk_M-000161", "chunk_M-000162", "chunk_M-000163", "chunk_M-000164", "chunk_M-000165", "chunk_M-000166", "chunk_M-000167", "chunk_M-000168", "chunk_M-000169", "chunk_M-000170", "chunk_M-000173", "chunk_M-000174", "chunk_M-000175", "chunk_M-000176", "chunk_M-000177", "chunk_M-000178", "chunk_M-000179", "chunk_M-000180", "chunk_M-000181", "chunk_M-000182", "chunk_M-000183", "chunk_M-000184", "chunk_M-000185", "chunk_M-000186", "chunk_M-000187", "chunk_M-000188", "chunk_M-000189", "chunk_M-000190", "chunk_M-000191", "chunk_M-000192", "chunk_M-000193", "chunk_M-000194", "chunk_M-000195", "chunk_M-000196", "chunk_M-000197", "chunk_M-000198", "chunk_M-000199", "chunk_M-000200", "chunk_M-000201", "chunk_M-000202", "chunk_M-000203", "chunk_M-000204", "chunk_M-000205", "att_M-000206_p1", "att_M-000207_p1", "att_M-000207_p2"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/chunks.jsonl CHANGED
@@ -205,3 +205,6 @@
205
  {"chunk_id": "chunk_M-000203", "source": "email", "thread_id": "T-0015", "message_id": "M-000203", "page_no": null, "text": "yes\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 03:03:17 PM\nPlease respond to <epao@mba2002.hbs.edu>\nTo: <John.Arnold@enron.com>\ncc: \nSubject: RE: Extra credit\n\n\nis constellation energy a goldman company?\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:55 PM\nTo: epao@mba2002.hbs.edu\nSubject: RE: Extra credit\n\n\n\nme thinks you missed a 9\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 02:58:28 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: Extra credit\n\n\n1.99999999999999999999999999999999999\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:37 PM\nTo: epao@mba2002.hbs.edu\nSubject: Re: Extra credit\n\n\n\nrules to a game:\nYou flip a coin. If you get tails you win 0. if you get heads, i give you\n$1. Keep flipping until you get a tails, at which point you walk away with\nthe money. however, each heads you get after the first you double your\nmoney. So if you flip heads 3 times and then tails, you get $4. What's\nyou bid/offer on playing this game? (would you pay $.5 to play? $1? $2?\nwhat you charge me play against you?)\n\n\n\n\n\n\n\n\n", "subject": "RE: Extra credit", "date": "2001-05-13 08:57:00-07:00", "from": "john.arnold@enron.com", "to": "epao@mba2002.hbs.edu"}
206
  {"chunk_id": "chunk_M-000204", "source": "email", "thread_id": "T-0015", "message_id": "M-000204", "page_no": null, "text": "i think it's 100\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 03:01:23 PM\nPlease respond to <epao@mba2002.hbs.edu>\nTo: <John.Arnold@enron.com>\ncc: \nSubject: RE: Extra credit\n\n\nfill in\n$ ____2_ per/share\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:54 PM\nTo: epao@mba2002.hbs.edu\nSubject: RE: Extra credit\n\n\n\nfill in\n$ _____ per/share\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 02:39:23 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: Extra credit\n\n\nwhat's my bid for what??\nps\ndon't no crap me.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:30 PM\nTo: epao@mba2002.hbs.edu\nSubject: Re: Extra credit\n\n\n\nno crap, what's your bid?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 12:48:23 AM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: Extra credit\n\n\nbreak even on info ass-symetry is 100%, any project above that level is\nprofitable to Pooks&Co.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 1:04 AM\nTo: epao@mba2002.hbs.edu\nSubject: RE: try this one...\n\n\n\nFor extra credit....\nIf the company is worth 150% more under management A rather than 50% more,\ndoes your answer change?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/11/2001 05:13:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: try this one...\n\n\nwill you do all of my homework?\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Friday, May 11, 2001 8:41 AM\nTo: epao@mba2002.hbs.edu\nSubject: Re: try this 
one...\n\n\n\ni'll pay a grand total of 0\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/10/2001 05:15:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <jarnold@enron.com>\ncc:\nSubject: try this one...\n\n\n Please read the following problem very carefully, and write in a number at\nthe end. You should be ready to defend your answer. Only a number is\nallowed, not an algebraic equation.\n\nAcquiring a Company\n\n In the following exercise you will represent Company A (the acquirer),\nwhich is currently considering acquiring Company T (the target) by means of\na tender offer. You plan to tender in cash for 100% of Company T's shares\nbut are unsure how high a price to offer. The main complication is this:\nthe value of Company T depends directly on the outcome of a major oil\nexploration project it is currently undertaking. Indeed, the very\nviability\nof Company T depends on the exploration outcome. If the project fails, the\ncompany under current management will be worth nothing--$0/share. But if\nthe project succeeds, the value of the company under current management\ncould be as high as $100/share. All share values between $0 and $100 are\nconsidered equally likely. By all estimates, the company will be worth\nconsiderably more in the hands of Company A than under current management.\nIn fact, whatever the ultimate value under current management, the company\nwill be worth fifty percent more under the management of A than under\nCompany T. If the project fails, the company is worth $0/share under\neither\nmanagement. If the exploration project generates a $50/share value under\ncurrent management, the value under Company A is $75/share. Similarly, a\n$100/share value under Company T implies a $150/share value under Company\nA,\nand so on.\n\n The board of directors of Company A has asked you to determine the\nprice\nthey should offer for Company T's shares. This offer must be made now,\nbefore the outcome of the drilling project is known. 
From all indications,\nCompany T would be happy to be acquired by Company A, provided it is at a\nprofitable price. Moreover, Company T wishes to avoid, at all cost, the\npotential of a takeover bid by any other firm. You expect Company T to\ndelay a decision on your bid until the results of the project are in, then\naccept or reject your offer before the news of the drilling results reaches\nthe press.\n\n Thus, you (Company A) will not know the results of the exploration\nproject\nwhen submitting your price offer, but Company T will know the results when\ndeciding whether or not to accept your offer. In addition, Company T will\naccept any offer by Company A that is greater than the (per share) value of\nthe company under current management. Thus, if you offer $50/share, for\ninstance, Company T will accept if the value of the company to Company T is\nanything less than $50.\n\n As the representative of Company A, you are deliberating over price\noffers\nin the range of $0/share (this is tantamount to making no offer at all) to\n$150/share. What price offer per share would you tender for Company T's\nstock?\n\n $______ per/share\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", "subject": "RE: Extra credit", "date": "2001-05-13 09:16:00-07:00", "from": "john.arnold@enron.com", "to": "epao@mba2002.hbs.edu"}
207
  {"chunk_id": "chunk_M-000205", "source": "email", "thread_id": "T-0015", "message_id": "M-000205", "page_no": null, "text": "i think it's 100\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 03:01:23 PM\nPlease respond to <epao@mba2002.hbs.edu>\nTo: <John.Arnold@enron.com>\ncc: \nSubject: RE: Extra credit\n\n\nfill in\n$ ____2_ per/share\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:54 PM\nTo: epao@mba2002.hbs.edu\nSubject: RE: Extra credit\n\n\n\nfill in\n$ _____ per/share\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 02:39:23 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: Extra credit\n\n\nwhat's my bid for what??\nps\ndon't no crap me.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:30 PM\nTo: epao@mba2002.hbs.edu\nSubject: Re: Extra credit\n\n\n\nno crap, what's your bid?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 12:48:23 AM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: Extra credit\n\n\nbreak even on info ass-symetry is 100%, any project above that level is\nprofitable to Pooks&Co.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 1:04 AM\nTo: epao@mba2002.hbs.edu\nSubject: RE: try this one...\n\n\n\nFor extra credit....\nIf the company is worth 150% more under management A rather than 50% more,\ndoes your answer change?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/11/2001 05:13:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: try this one...\n\n\nwill you do all of my homework?\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Friday, May 11, 2001 8:41 AM\nTo: epao@mba2002.hbs.edu\nSubject: Re: try this 
one...\n\n\n\ni'll pay a grand total of 0\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/10/2001 05:15:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <jarnold@enron.com>\ncc:\nSubject: try this one...\n\n\n Please read the following problem very carefully, and write in a number at\nthe end. You should be ready to defend your answer. Only a number is\nallowed, not an algebraic equation.\n\nAcquiring a Company\n\n In the following exercise you will represent Company A (the acquirer),\nwhich is currently considering acquiring Company T (the target) by means of\na tender offer. You plan to tender in cash for 100% of Company T's shares\nbut are unsure how high a price to offer. The main complication is this:\nthe value of Company T depends directly on the outcome of a major oil\nexploration project it is currently undertaking. Indeed, the very\nviability\nof Company T depends on the exploration outcome. If the project fails, the\ncompany under current management will be worth nothing--$0/share. But if\nthe project succeeds, the value of the company under current management\ncould be as high as $100/share. All share values between $0 and $100 are\nconsidered equally likely. By all estimates, the company will be worth\nconsiderably more in the hands of Company A than under current management.\nIn fact, whatever the ultimate value under current management, the company\nwill be worth fifty percent more under the management of A than under\nCompany T. If the project fails, the company is worth $0/share under\neither\nmanagement. If the exploration project generates a $50/share value under\ncurrent management, the value under Company A is $75/share. Similarly, a\n$100/share value under Company T implies a $150/share value under Company\nA,\nand so on.\n\n The board of directors of Company A has asked you to determine the\nprice\nthey should offer for Company T's shares. This offer must be made now,\nbefore the outcome of the drilling project is known. 
From all indications,\nCompany T would be happy to be acquired by Company A, provided it is at a\nprofitable price. Moreover, Company T wishes to avoid, at all cost, the\npotential of a takeover bid by any other firm. You expect Company T to\ndelay a decision on your bid until the results of the project are in, then\naccept or reject your offer before the news of the drilling results reaches\nthe press.\n\n Thus, you (Company A) will not know the results of the exploration\nproject\nwhen submitting your price offer, but Company T will know the results when\ndeciding whether or not to accept your offer. In addition, Company T will\naccept any offer by Company A that is greater than the (per share) value of\nthe company under current management. Thus, if you offer $50/share, for\ninstance, Company T will accept if the value of the company to Company T is\nanything less than $50.\n\n As the representative of Company A, you are deliberating over price\noffers\nin the range of $0/share (this is tantamount to making no offer at all) to\n$150/share. What price offer per share would you tender for Company T's\nstock?\n\n $______ per/share\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", "subject": "RE: Extra credit", "date": "2001-05-13 09:16:00-07:00", "from": "john.arnold@enron.com", "to": "epao@mba2002.hbs.edu"}
 
 
 
 
205
  {"chunk_id": "chunk_M-000203", "source": "email", "thread_id": "T-0015", "message_id": "M-000203", "page_no": null, "text": "yes\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 03:03:17 PM\nPlease respond to <epao@mba2002.hbs.edu>\nTo: <John.Arnold@enron.com>\ncc: \nSubject: RE: Extra credit\n\n\nis constellation energy a goldman company?\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:55 PM\nTo: epao@mba2002.hbs.edu\nSubject: RE: Extra credit\n\n\n\nme thinks you missed a 9\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 02:58:28 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: Extra credit\n\n\n1.99999999999999999999999999999999999\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:37 PM\nTo: epao@mba2002.hbs.edu\nSubject: Re: Extra credit\n\n\n\nrules to a game:\nYou flip a coin. If you get tails you win 0. if you get heads, i give you\n$1. Keep flipping until you get a tails, at which point you walk away with\nthe money. however, each heads you get after the first you double your\nmoney. So if you flip heads 3 times and then tails, you get $4. What's\nyou bid/offer on playing this game? (would you pay $.5 to play? $1? $2?\nwhat you charge me play against you?)\n\n\n\n\n\n\n\n\n", "subject": "RE: Extra credit", "date": "2001-05-13 08:57:00-07:00", "from": "john.arnold@enron.com", "to": "epao@mba2002.hbs.edu"}
206
  {"chunk_id": "chunk_M-000204", "source": "email", "thread_id": "T-0015", "message_id": "M-000204", "page_no": null, "text": "i think it's 100\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 03:01:23 PM\nPlease respond to <epao@mba2002.hbs.edu>\nTo: <John.Arnold@enron.com>\ncc: \nSubject: RE: Extra credit\n\n\nfill in\n$ ____2_ per/share\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:54 PM\nTo: epao@mba2002.hbs.edu\nSubject: RE: Extra credit\n\n\n\nfill in\n$ _____ per/share\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 02:39:23 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: Extra credit\n\n\nwhat's my bid for what??\nps\ndon't no crap me.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:30 PM\nTo: epao@mba2002.hbs.edu\nSubject: Re: Extra credit\n\n\n\nno crap, what's your bid?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 12:48:23 AM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: Extra credit\n\n\nbreak even on info ass-symetry is 100%, any project above that level is\nprofitable to Pooks&Co.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 1:04 AM\nTo: epao@mba2002.hbs.edu\nSubject: RE: try this one...\n\n\n\nFor extra credit....\nIf the company is worth 150% more under management A rather than 50% more,\ndoes your answer change?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/11/2001 05:13:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: try this one...\n\n\nwill you do all of my homework?\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Friday, May 11, 2001 8:41 AM\nTo: epao@mba2002.hbs.edu\nSubject: Re: try this 
one...\n\n\n\ni'll pay a grand total of 0\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/10/2001 05:15:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <jarnold@enron.com>\ncc:\nSubject: try this one...\n\n\n Please read the following problem very carefully, and write in a number at\nthe end. You should be ready to defend your answer. Only a number is\nallowed, not an algebraic equation.\n\nAcquiring a Company\n\n In the following exercise you will represent Company A (the acquirer),\nwhich is currently considering acquiring Company T (the target) by means of\na tender offer. You plan to tender in cash for 100% of Company T's shares\nbut are unsure how high a price to offer. The main complication is this:\nthe value of Company T depends directly on the outcome of a major oil\nexploration project it is currently undertaking. Indeed, the very\nviability\nof Company T depends on the exploration outcome. If the project fails, the\ncompany under current management will be worth nothing--$0/share. But if\nthe project succeeds, the value of the company under current management\ncould be as high as $100/share. All share values between $0 and $100 are\nconsidered equally likely. By all estimates, the company will be worth\nconsiderably more in the hands of Company A than under current management.\nIn fact, whatever the ultimate value under current management, the company\nwill be worth fifty percent more under the management of A than under\nCompany T. If the project fails, the company is worth $0/share under\neither\nmanagement. If the exploration project generates a $50/share value under\ncurrent management, the value under Company A is $75/share. Similarly, a\n$100/share value under Company T implies a $150/share value under Company\nA,\nand so on.\n\n The board of directors of Company A has asked you to determine the\nprice\nthey should offer for Company T's shares. This offer must be made now,\nbefore the outcome of the drilling project is known. 
From all indications,\nCompany T would be happy to be acquired by Company A, provided it is at a\nprofitable price. Moreover, Company T wishes to avoid, at all cost, the\npotential of a takeover bid by any other firm. You expect Company T to\ndelay a decision on your bid until the results of the project are in, then\naccept or reject your offer before the news of the drilling results reaches\nthe press.\n\n Thus, you (Company A) will not know the results of the exploration\nproject\nwhen submitting your price offer, but Company T will know the results when\ndeciding whether or not to accept your offer. In addition, Company T will\naccept any offer by Company A that is greater than the (per share) value of\nthe company under current management. Thus, if you offer $50/share, for\ninstance, Company T will accept if the value of the company to Company T is\nanything less than $50.\n\n As the representative of Company A, you are deliberating over price\noffers\nin the range of $0/share (this is tantamount to making no offer at all) to\n$150/share. What price offer per share would you tender for Company T's\nstock?\n\n $______ per/share\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", "subject": "RE: Extra credit", "date": "2001-05-13 09:16:00-07:00", "from": "john.arnold@enron.com", "to": "epao@mba2002.hbs.edu"}
207
  {"chunk_id": "chunk_M-000205", "source": "email", "thread_id": "T-0015", "message_id": "M-000205", "page_no": null, "text": "i think it's 100\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 03:01:23 PM\nPlease respond to <epao@mba2002.hbs.edu>\nTo: <John.Arnold@enron.com>\ncc: \nSubject: RE: Extra credit\n\n\nfill in\n$ ____2_ per/share\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:54 PM\nTo: epao@mba2002.hbs.edu\nSubject: RE: Extra credit\n\n\n\nfill in\n$ _____ per/share\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 02:39:23 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: Extra credit\n\n\nwhat's my bid for what??\nps\ndon't no crap me.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 4:30 PM\nTo: epao@mba2002.hbs.edu\nSubject: Re: Extra credit\n\n\n\nno crap, what's your bid?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/13/2001 12:48:23 AM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: Extra credit\n\n\nbreak even on info ass-symetry is 100%, any project above that level is\nprofitable to Pooks&Co.\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Sunday, May 13, 2001 1:04 AM\nTo: epao@mba2002.hbs.edu\nSubject: RE: try this one...\n\n\n\nFor extra credit....\nIf the company is worth 150% more under management A rather than 50% more,\ndoes your answer change?\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/11/2001 05:13:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <John.Arnold@enron.com>\ncc:\nSubject: RE: try this one...\n\n\nwill you do all of my homework?\n\n-----Original Message-----\nFrom: John.Arnold@enron.com [mailto:John.Arnold@enron.com]\nSent: Friday, May 11, 2001 8:41 AM\nTo: epao@mba2002.hbs.edu\nSubject: Re: try this 
one...\n\n\n\ni'll pay a grand total of 0\n\n\n\n\n\"Eva Pao\" <epao@mba2002.hbs.edu> on 05/10/2001 05:15:59 PM\n\nPlease respond to <epao@mba2002.hbs.edu>\n\nTo: <jarnold@enron.com>\ncc:\nSubject: try this one...\n\n\n Please read the following problem very carefully, and write in a number at\nthe end. You should be ready to defend your answer. Only a number is\nallowed, not an algebraic equation.\n\nAcquiring a Company\n\n In the following exercise you will represent Company A (the acquirer),\nwhich is currently considering acquiring Company T (the target) by means of\na tender offer. You plan to tender in cash for 100% of Company T's shares\nbut are unsure how high a price to offer. The main complication is this:\nthe value of Company T depends directly on the outcome of a major oil\nexploration project it is currently undertaking. Indeed, the very\nviability\nof Company T depends on the exploration outcome. If the project fails, the\ncompany under current management will be worth nothing--$0/share. But if\nthe project succeeds, the value of the company under current management\ncould be as high as $100/share. All share values between $0 and $100 are\nconsidered equally likely. By all estimates, the company will be worth\nconsiderably more in the hands of Company A than under current management.\nIn fact, whatever the ultimate value under current management, the company\nwill be worth fifty percent more under the management of A than under\nCompany T. If the project fails, the company is worth $0/share under\neither\nmanagement. If the exploration project generates a $50/share value under\ncurrent management, the value under Company A is $75/share. Similarly, a\n$100/share value under Company T implies a $150/share value under Company\nA,\nand so on.\n\n The board of directors of Company A has asked you to determine the\nprice\nthey should offer for Company T's shares. This offer must be made now,\nbefore the outcome of the drilling project is known. 
From all indications,\nCompany T would be happy to be acquired by Company A, provided it is at a\nprofitable price. Moreover, Company T wishes to avoid, at all cost, the\npotential of a takeover bid by any other firm. You expect Company T to\ndelay a decision on your bid until the results of the project are in, then\naccept or reject your offer before the news of the drilling results reaches\nthe press.\n\n Thus, you (Company A) will not know the results of the exploration\nproject\nwhen submitting your price offer, but Company T will know the results when\ndeciding whether or not to accept your offer. In addition, Company T will\naccept any offer by Company A that is greater than the (per share) value of\nthe company under current management. Thus, if you offer $50/share, for\ninstance, Company T will accept if the value of the company to Company T is\nanything less than $50.\n\n As the representative of Company A, you are deliberating over price\noffers\nin the range of $0/share (this is tantamount to making no offer at all) to\n$150/share. What price offer per share would you tender for Company T's\nstock?\n\n $______ per/share\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", "subject": "RE: Extra credit", "date": "2001-05-13 09:16:00-07:00", "from": "john.arnold@enron.com", "to": "epao@mba2002.hbs.edu"}
208
+ {"chunk_id": "att_M-000206_p1", "thread_id": "T-0012", "message_id": "M-000206", "page_no": 1, "source": "attachment", "text": "DRAFT \u2013 NGI access terms for Dexter Steis. Guest login provides read-only access to NGI indices on EOL for the SoCal Border hub. Pricing is indicative only and may change before final approval.", "filename": "NGI_draft_terms.pdf"}
209
+ {"chunk_id": "att_M-000207_p1", "thread_id": "T-0012", "message_id": "M-000207", "page_no": 1, "source": "attachment", "text": "FINAL APPROVAL \u2013 NGI access for Dexter Steis. Enron Trading approves a guest ID for NGI access on EOL. Access covers SoCal Border index data and is intended to support fair index formation.", "filename": "NGI_final_approval.pdf"}
210
+ {"chunk_id": "att_M-000207_p2", "thread_id": "T-0012", "message_id": "M-000207", "page_no": 2, "source": "attachment", "text": "DETAILS \u2013 The guest ID for Dexter Steis is valid through January of the following year, subject to Enron credit approval. If any issues arise, Dexter should contact Phillip Allen or the Enron credit desk.", "filename": "NGI_final_approval.pdf"}
data/embeddings.npy CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:964fe2c43460e9705195e66890557f2bfa561d2ab6156ba00d1b51126cad4064
3
- size 318080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dd7e4d4f20c9645884b00bd1a4acd96726ac3617d57c59da87bb424244756e2
3
+ size 322688
data/fake_attachments.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"thread_id": "T-0012", "message_id": "M-000206", "filename": "NGI_draft_terms.pdf", "page_no": 1, "source": "attachment", "text": "DRAFT – NGI access terms for Dexter Steis. Guest login provides read-only access to NGI indices on EOL for the SoCal Border hub. Pricing is indicative only and may change before final approval."}
2
+ {"thread_id": "T-0012", "message_id": "M-000207", "filename": "NGI_final_approval.pdf", "page_no": 1, "source": "attachment", "text": "FINAL APPROVAL – NGI access for Dexter Steis. Enron Trading approves a guest ID for NGI access on EOL. Access covers SoCal Border index data and is intended to support fair index formation."}
3
+ {"thread_id": "T-0012", "message_id": "M-000207", "filename": "NGI_final_approval.pdf", "page_no": 2, "source": "attachment", "text": "DETAILS – The guest ID for Dexter Steis is valid through January of the following year, subject to Enron credit approval. If any issues arise, Dexter should contact Phillip Allen or the Enron credit desk."}