LPX55 committed on
Commit
20235b3
·
verified ·
1 Parent(s): b79fd97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +218 -752
app.py CHANGED
@@ -1,781 +1,247 @@
1
- import gradio as gr
2
- #import urllib.request
3
- import requests
4
- import bs4
5
- import lxml
6
  import os
7
- #import subprocess
8
- from huggingface_hub import InferenceClient,HfApi
9
  import random
10
- import json
11
- import datetime
12
- from pypdf import PdfReader
13
  import uuid
14
- #from query import tasks
15
- from agent import (
16
- PREFIX,
17
- COMPRESS_DATA_PROMPT,
18
- COMPRESS_DATA_PROMPT_SMALL,
19
- LOG_PROMPT,
20
- LOG_RESPONSE,
21
- )
22
- client = InferenceClient(
23
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
24
- )
25
- reponame="LPX55/ArxivPapers"
26
- save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
27
- token_self = os.environ['HF_TOKEN']
28
- api=HfApi(token=token_self)
29
-
30
def find_all(url):
    """Fetch a URL and return (success, scraped-text-plus-link-fragments).

    Returns (True, payload) on HTTP 200, otherwise (False, message).
    Fixes over original: request timeout added (a dead host hung the UI
    forever), and the dead separator-counting loop removed.
    """
    print(f"trying URL:: {url}")
    try:
        if not url:
            print('passing')
            return False, "Enter Valid URL"
        source = requests.get(url, timeout=30)
        print(source.status_code)
        if source.status_code != 200:
            return False, f'Status:: {source.status_code}'
        print('trying')
        soup = bs4.BeautifulSoup(source.content, 'lxml')
        rawp = f'RAW TEXT RETURNED: {soup.text}'
        print(rawp)
        out = [rawp, "HTML fragments: "]
        # Collect every anchor with its title / href / display text.
        for a in soup.find_all("a"):
            out.append([{"LINK TITLE": a.get('title'),
                         "URL": a.get('href'),
                         "STRING": a.string}])
        return True, str(out)
    except Exception as e:
        print(e)
        return False, f'Error: {e}'
75
-
76
-
77
def read_txt(txt_path):
    """Return the full contents of a text file.

    Fixes over original: drops the redundant explicit f.close() inside
    the `with` block and pins the encoding to UTF-8 so behavior does not
    depend on the platform default.
    """
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(text)
    return text
84
-
85
def read_pdf(pdf_path):
    """Extract and return the text of every page of a local PDF.

    Fixes over original: uses join instead of quadratic string
    concatenation, and coalesces extract_text() to "" — it can return
    None for image-only pages, which broke the f-string accumulator.
    """
    reader = PdfReader(pdf_path)
    text = "".join(f"\n{page.extract_text() or ''}" for page in reader.pages)
    print(text)
    return text
94
-
95
error_box = []  # URLs that failed to download; read by the UI layer.

def read_pdf_online(url):
    """Download a PDF by URL and return its extracted text.

    Contract preserved from the original: a non-200 status is appended
    to `error_box` and the status code itself returned; an exception is
    returned as the exception object.
    Fixes: unique temp filename (the fixed "test.pdf" raced under the
    concurrent queue), request timeout, temp file removed afterwards.
    """
    uid = uuid.uuid4()
    print(f"reading {url}")
    try:
        response = requests.get(url, stream=True, timeout=60)
        print(response.status_code)
        if response.status_code != 200:
            error_box.append(url)
            print(response.status_code)
            return response.status_code
        tmp = f"dl-{uid}.pdf"
        with open(tmp, "wb") as f:
            f.write(response.content)
        reader = PdfReader(tmp)
        print(len(reader.pages))
        text = ""
        for page in reader.pages:
            text = f"{text}\n{page.extract_text() or ''}"
        os.remove(tmp)  # fix: original left test.pdf behind
        print(f"PDF_TEXT:: {text}")
        return text
    except Exception as e:
        print(e)
        return e
130
-
131
-
132
VERBOSE = True
MAX_HISTORY = 100
MAX_DATA = 20000

def format_prompt(message, history):
    """Render chat history plus the new message in Mixtral [INST] format."""
    pieces = ["<s>"]
    for user_turn, bot_turn in history:
        pieces.append(f"[INST] {user_turn} [/INST]")
        pieces.append(f" {bot_turn}</s> ")
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
143
-
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    """Stream a completion from the Mixtral client, return the full text."""
    print(seed)
    timestamp = datetime.datetime.now()

    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )

    # Agent PREFIX first, then the task-specific template.
    content = PREFIX.format(
        timestamp=timestamp,
        purpose="Compile the provided data and complete the users task",
    ) + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))

    stream = client.text_generation(
        content, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    resp = "".join(token.token.text for token in stream)

    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp
185
-
186
 
187
def compress_data(c, instruct, history):
    """Summarize `history` in MAX_DATA-sized chunks.

    c is the approximate separator count of `history`; returns the list
    of per-chunk summaries.
    Fix over original: c == 0 caused ZeroDivisionError in the chunk math.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    c = max(int(c), 1)  # guard: original divided by zero when c == 0
    divr = c / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(c / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    s = 0
    e = chunk
    print(f'e:: {e}')
    for _ in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=hist,
        )
        out.append(resp)
        print(resp)
        e += chunk
        s += chunk
    return out
227
 
 
 
 
 
228
 
229
def compress_data_og(c, instruct, history):
    """Iteratively compress `history`, feeding each summary back as context.

    Returns the final summary string.
    Fixes over original: removed the dead `out += resp` (it extended a
    list character-by-character and was never used), and guarded the
    c == 0 ZeroDivisionError in the chunk math.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    c = max(int(c), 1)  # guard: original divided by zero when c == 0
    divr = c / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(c / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    resp = ""
    for _ in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=hist,
        )
        # Each pass sees the previous summary as accumulated knowledge.
        new_history = resp
        print(resp)
        e += chunk
        s += chunk
    print("final" + resp)
    return resp
281
-
282
-
283
RECALL_MEMORY="""The user will give you a query and a list
Your duty is to choose the words from the list that are closely related to the search query.
If there are no relevant keywords found in the provided list return 'NONE'
Respond with only a list, or NONE
Respond only in this format:
[keyword1,keyword2,keyword3]

USER QUERY:
{prompt}

KEYWORD LIST:
{keywords}
"""


def get_mem(prompt, kw):
    """Ask the model which keywords from `kw` are relevant to `prompt`."""
    seed = random.randint(1, 1000000000)
    generate_kwargs = dict(
        temperature=0.6,
        max_new_tokens=1024,
        top_p=0.6,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    content = RECALL_MEMORY.format(keywords=kw, prompt=prompt)
    stream = client.text_generation(
        content, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    resp = "".join(piece.token.text for piece in stream)
    print(resp)
    return resp
320
-
321
 
322
def summarize(inp, history, report_check, sum_check, mem_check,
              data=None, files=None, url=None, pdf_url=None, pdf_batch=None):
    """Gather data from every input tab, then summarize and/or memorize it.

    Yields (prompt, chat_history, error_box, json_output) for the UI.
    Fixes over original: `json_out` and `rawp` could be referenced while
    unbound on several paths (NameError); `.startswith` on a None
    textbox value crashed; data=None leaked the string "None" into the
    corpus.
    """
    json_out = []  # fix: was unbound when no valid data source was given
    rawp = ""      # fix: was unbound when sum_check and mem_check were both off
    if inp == "":
        inp = "Process this data"
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_out

    data = data or ""
    # --- batch of comma-separated PDF URLs -------------------------------
    if pdf_batch and pdf_batch.startswith("http"):
        for batch_url in pdf_batch.split(","):
            batch_url = batch_url.strip()
            if not batch_url:
                continue
            try:
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
            except Exception as e:
                print(e)
    # --- single PDF URL --------------------------------------------------
    if pdf_url and pdf_url.startswith("http"):
        print("PDF_URL")
        data = read_pdf_online(pdf_url)
    # --- raw HTML URL ----------------------------------------------------
    if url and url.startswith("http"):
        val, out = find_all(url)
        if not val:
            data = "Error"
            rawp = str(out)
        else:
            data = out
    # --- uploaded files --------------------------------------------------
    if files:
        for file in files:
            try:
                print(file)
                if file.endswith(".pdf"):
                    data = f'{data}\nFile Name ({file}):\n{read_pdf(file)}'
                elif file.endswith(".txt"):
                    data = f'{data}\nFile Name ({file}):\n{read_txt(file)}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    if data not in ("Error", ""):
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        # Crude size estimate: count whitespace/comma/newline boundaries.
        c = 1 + sum(1 for ch in out if ch in " ,\n")
        print(f'c:: {c}')
        if mem_check:
            json_out = save_memory(inp, out)
            rawp = "Complete"
        if sum_check:
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                c2 = 1 + sum(1 for ch in out if ch in " ,\n")
                print(f'c2:: {c2}')
                rawp = compress_data_og(c2, inp, out)
            else:
                rawp = out
            json_out = format_json(json_out)
    else:
        rawp = "Provide a valid data source"
    history = [(inp, rawp)]
    yield "", history, error_box, json_out
406
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"],
"title":"title of entry",
"description":"A sentence summarizing the topic of this entry",
"content":"A brief paragraph summarizing the important datapoints found in this entry",
"url":"https://url.source"
"""


def format_json(inp):
    """Strip markdown fencing from a model response and parse it as a literal.

    Accepts either a list of lines or a single string (the original
    iterated a string character-by-character, which mangled the input).
    Fix: parses with ast.literal_eval instead of eval() — eval on model
    output is an arbitrary-code-execution risk.
    """
    import ast

    print("FORMATTING:::")
    print(type(inp))
    print("###########")
    print(inp)
    print("###########")
    lines = inp.splitlines() if isinstance(inp, str) else inp
    new_str = ""
    for line in lines:
        line = line.strip()
        print(line)
        new_str += line.strip("\n").strip("```").strip("#").strip("//")
    new_str = new_str.strip("</s>")
    out_json = ast.literal_eval(new_str)
    print(out_json)
    return out_json
450
-
451
-
452
-
453
-
454
def format_json_og(inp):
    """Parse key:value lines emitted by the model into a list of dicts.

    A '{' line starts a new record, recognized fields are captured, and
    a '}' line commits the record.
    """
    parsed = []
    current = {}
    print("FORMATTING:::")
    for line in inp:
        line = line.strip()
        if "{" in line:
            print(line)
            current = {}
        if ":" in line:
            # Same recognition order as before: later fields overwrite
            # nothing, they simply add keys in this sequence.
            for field in ("keywords", "title", "description", "content", "url"):
                if field in line:
                    current[field] = line.split(":")[1].strip(",")
                    print(line)
        if "}" in line:
            parsed.append(current)
            print(parsed)
    return parsed
484
-
485
def create_index():
    """Rebuild the keyword -> [file_name] index from main.json and upload it.

    Fixes over original: the `main` fallback tested r.status_code (the
    *index* request) instead of m.status_code, so `main` could be
    unbound; the exception path called .append() on a dict; requests now
    have timeouts; the temp file is removed after upload.
    """
    uid = uuid.uuid4()

    # ---- load existing index -------------------------------------------
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        ind = json.loads(r.text)
        print(f'ind::\n{ind}')
    else:
        print("Create new IND")
        ind = [{}]

    # ---- load main memory file -----------------------------------------
    m = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
    print(f'status code main:: {m.status_code}')
    if m.status_code == 200:
        main = json.loads(m.text)
    else:
        main = []

    # ---- fold every entry's keywords into the index ---------------------
    for ea in main:
        try:
            for k in ea['keywords']:
                print(k)
                print(ea['file_name'])
                if k in ind[0]:
                    print("Adding to list")
                    if ea['file_name'] not in ind[0][k]:
                        ind[0][k].append(ea['file_name'])
                else:
                    print("Adding new Value")
                    ind[0][k] = [ea['file_name']]
        except Exception as e:
            print(e)

    # ---- persist and upload --------------------------------------------
    json_object = json.dumps(ind, indent=4)
    tmp_path = f"tmp3-{uid}.json"
    with open(tmp_path, "w") as outfile3:
        outfile3.write(json_object)
    try:
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo="/mem-test2/index.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
    finally:
        os.remove(tmp_path)  # fix: original leaked one temp file per call
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
 
549
 
550
def save_memory(purpose, history):
    """Index `history` into chunked JSON memory files on the dataset repo.

    For each MAX_DATA-sized chunk: ask the model for a JSON index entry,
    upload it as its own file, fold its keywords/description into
    main.json, then rebuild the keyword index. Returns the list of raw
    model responses (one per chunk).
    Fixes over original: removed the dead `json_object['keywords']`
    lookup (json_object was a string, so it always raised); guarded the
    c == 0 ZeroDivisionError; request timeouts added.
    """
    uid = uuid.uuid4()
    inp = str(history)
    print(f'rl:: {len(inp)}')
    c = 1 + sum(1 for ch in inp if ch in " ,\n/\\.<")
    print(f'c:: {c}')

    seed = random.randint(1, 1000000000)
    print(c)
    c = max(int(c), 1)  # guard: original divided by zero when c == 0
    divr = c / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(c / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out_box = []
    s = 0
    ee = chunk
    print(f'e:: {ee}')
    task = 'Index this Data\n'
    for _ in range(divi):
        print(f's:e :: {s}:{ee}')
        hist = inp[s:ee]
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=hist,
        ).strip('\n')
        # Trim the response down to the '[{...' JSON payload, if present.
        try:
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]
        except Exception as e:
            print(e)
        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        with open(f"tmp-{uid}.json", "w") as outfile:
            outfile.write(resp)
        api.upload_file(
            path_or_fileobj=f"tmp-{uid}.json",
            path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        lines = resp.strip().strip("\n").split("\n")

        # ---- fold this chunk's keywords/description into main.json ------
        r = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
        print(f'status code main:: {r.status_code}')
        lod = json.loads(r.text) if r.status_code == 200 else []
        print(f'lod:: {lod}')
        key_box = []
        desc = ""
        for line in lines:
            if ":" not in line:
                continue
            print(f'line:: {line}')
            if "keywords" in line:
                print(f'trying:: {line}')
                keyw = line.split(":")[1]
                print(keyw)
                try:
                    keyw = keyw.split("[")[1].split("]")[0]
                    for ea in keyw.split(","):
                        # Keep only alphanumerics and spaces.
                        s1 = "".join(ch for ch in ea.strip().strip("\n")
                                     if ch.isalnum() or ch == " ")
                        print(s1)
                        key_box.append(s1)
                except Exception as e:
                    print(f'ERROR SAVING KEYWORD:: {e}')
            if "description" in line:
                desc = line.split(":")[1]
            if key_box and desc:
                lod.append({"file_name": f"{timename}---{s}-{ee}",
                            "keywords": key_box,
                            "description": str(desc),
                            "index": f"{s}:{ee}"})
                key_box = []
                desc = ""
        with open(f"tmp2-{uid}.json", "w") as outfile2:
            outfile2.write(json.dumps(lod, indent=4))
        api.upload_file(
            path_or_fileobj=f"tmp2-{uid}.json",
            path_in_repo="/mem-test2/main.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        ee += chunk
        s += chunk
        out_box.append(resp)
    create_index()
    return out_box
693
-
694
def valid_list(inp):
    """Parse a "[a,b,c]"-style model response into a Python list.

    BUG FIX: the original built `out_list` but never returned it, so
    every caller received None. Non-string input yields [].
    """
    out_list = []
    print(type(inp))
    if isinstance(inp, str):
        print("STRING")
        new_list = inp.split("[")[1].split("]", -1)[0]
        print(new_list)
        print(type(new_list))
        for ea in new_list.split(","):
            out_list.append(ea.replace("'", "").replace('"', ""))
        print(out_list)
        print(type(out_list))
    return out_list
710
-
711
-
712
def recall_memory(inp, history):
    """Look up index keywords relevant to `inp` and show them in the chat.

    BUG FIX: on a missing index the original executed
    `return out,out,out,out` inside a *generator*, which ends iteration
    without emitting anything — the UI silently showed nothing. We yield
    the error message instead.
    """
    error_box = ""
    json_out = {}
    if not history:
        history = []
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        mem = json.loads(r.text)
        print(f'ind::\n{mem}')
    else:
        print("Create new IND")
        out = "MEMORY FILE NOT FOUND"
        yield "", [(inp, out)], out, json_out
        return
    mem_keys = mem[0].keys()
    rawp = get_mem(inp, mem_keys)
    valid_list(rawp)
    history = [(inp, rawp)]
    yield "", history, error_box, json_out
734
-
735
-
736
-
737
- #################################
738
def clear_fn():
    """Reset the prompt box and chatbot to their empty state."""
    empty_chat = [(None, None)]
    return "", empty_chat
740
 
741
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot", show_copy_button=True)

    # --- prompt row: instructions plus output-mode toggles ---------------
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(label="Instructions (optional)")
        with gr.Column(scale=1):
            report_check = gr.Checkbox(label="Return Report", value=True)
            sum_check = gr.Checkbox(label="Summarize", value=True)
            mem_check = gr.Checkbox(label="Memory", value=True)
            button = gr.Button()

    with gr.Row():
        stop_button = gr.Button("Stop")
        clear_btn = gr.Button("Clear")

    # --- input tabs: one per data source ---------------------------------
    with gr.Row():
        with gr.Tab("Text"):
            data = gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file = gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
        with gr.Tab("Memory"):
            mem_inp = gr.Textbox(label="Query")
            mem = gr.Button()
    json_out = gr.JSON()
    e_box = gr.Textbox()

    # --- event wiring -----------------------------------------------------
    mem.click(recall_memory, mem_inp, [prompt, chatbot, e_box, json_out])
    clear_btn.click(clear_fn, None, [prompt, chatbot])
    go = button.click(
        summarize,
        [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url, pdf_batch],
        [prompt, chatbot, e_box, json_out],
    )
    stop_button.click(None, None, None, cancels=[go])
app.queue(default_concurrency_limit=20).launch(show_api=False)
 
1
+ import json
 
 
 
 
2
  import os
 
 
3
  import random
 
 
 
4
  import uuid
5
+ import datetime
6
+ from typing import List, Tuple, Dict, Optional, Generator, Any
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ import gradio as gr
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ from pypdf import PdfReader
12
+ import openai
13
+ from huggingface_hub import HfApi
14
+
15
# Configuration
# NOTE(review): both keys fall back to "" when the env var is absent —
# confirm that downstream calls fail loudly enough without credentials.
OPENAI_API_BASE = "https://openrouter.ai/api/v1"
OPENAI_API_KEY = os.environ.get("OR_KEY", "")
REPO_NAME = "LPX55/ArxivPapers"
SAVE_DATA_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
api = HfApi(token=HF_TOKEN)

# Initialize OpenAI client — pointed at OpenRouter, not api.openai.com.
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY


# Indexing Constants
INDEX_PROMPT = """Compile this data into a structured JSON format with these keys:
- "keywords": List of important keywords
- "title": Descriptive title
- "description": Brief summary
- "content": Main content
- "url": Source URL if available
"""
36
 
37
def create_index() -> None:
    """Create or update the keyword search index from memory files.

    Fixes over original: request timeouts, tolerance for a 200 response
    whose body is not valid JSON, and removal of the temp file that was
    leaked once per call.
    """
    uid = uuid.uuid4()

    # Load existing index (fresh single-dict list when absent/invalid).
    index_url = f"{SAVE_DATA_URL}mem-test2/index.json"
    r = requests.get(index_url, timeout=30)
    try:
        index_data = json.loads(r.text) if r.status_code == 200 else [{}]
    except json.JSONDecodeError:
        index_data = [{}]

    # Load main memory data.
    main_url = f"{SAVE_DATA_URL}mem-test2/main.json"
    m = requests.get(main_url, timeout=30)
    try:
        main_data = json.loads(m.text) if m.status_code == 200 else []
    except json.JSONDecodeError:
        main_data = []

    # Update index: keyword -> list of file names containing it.
    for entry in main_data:
        try:
            for keyword in entry.get('keywords', []):
                if keyword in index_data[0]:
                    if entry['file_name'] not in index_data[0][keyword]:
                        index_data[0][keyword].append(entry['file_name'])
                else:
                    index_data[0][keyword] = [entry['file_name']]
        except Exception as e:
            print(f"Indexing error: {e}")

    # Save updated index, upload, then clean up the temp file.
    index_path = f"tmp-index-{uid}.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f)
    try:
        api.upload_file(
            path_or_fileobj=index_path,
            path_in_repo="/mem-test2/index.json",
            repo_id=REPO_NAME,
            repo_type="dataset",
        )
    finally:
        os.remove(index_path)  # fix: original leaked one temp file per call
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
def save_memory(purpose: str, content: str) -> List[Dict]:
    """Save processed content to memory with indexing.

    Returns a one-element list containing the saved entry, or [] when
    the model response could not be parsed.
    Fixes over original: a JSON payload that is not an object crashed
    the ** unpack; request timeout added; temp file removed after upload.
    """
    uid = uuid.uuid4()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

    # Generate structured data (content truncated to respect API limits).
    prompt = f"{INDEX_PROMPT}\nData to index:\n{content[:5000]}"
    try:
        response = generate_response(prompt, model="anthropic/claude-2")
        structured_data = json.loads(response)
        if not isinstance(structured_data, dict):
            # fix: a non-dict payload (e.g. a JSON list) broke ** below
            raise ValueError(f"expected JSON object, got {type(structured_data).__name__}")
    except Exception as e:
        print(f"Memory processing error: {e}")
        return []

    memory_entry = {
        **structured_data,
        "file_name": f"{timestamp}--{uid}.json",
        "timestamp": str(datetime.datetime.now()),
    }

    # Append the entry to the main memory file and re-upload it.
    main_url = f"{SAVE_DATA_URL}mem-test2/main.json"
    m = requests.get(main_url, timeout=30)
    main_data = json.loads(m.text) if m.status_code == 200 else []
    main_data.append(memory_entry)

    main_path = f"tmp-main-{uid}.json"
    with open(main_path, "w") as f:
        json.dump(main_data, f)
    try:
        api.upload_file(
            path_or_fileobj=main_path,
            path_in_repo="/mem-test2/main.json",
            repo_id=REPO_NAME,
            repo_type="dataset",
        )
    finally:
        os.remove(main_path)  # fix: original leaked one temp file per call

    # Rebuild the keyword index over the updated memory.
    create_index()

    return [memory_entry]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
def fetch_url_content(url: str) -> Tuple[bool, str]:
    """Fetch a URL and return (success, HTML-or-error-message).

    Fix over original: request timeout added so a dead host cannot hang
    the caller indefinitely.
    """
    try:
        if not url:
            return False, "Enter valid URL"

        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")
            return True, str(soup)
        return False, f"Status: {response.status_code}"
    except Exception as e:
        return False, f"Error: {e}"
131
+
132
def read_file_content(file_path: str) -> str:
    """Read text content from a .txt or .pdf file; "" for any other type.

    Fixes over original: extract_text() may return None for image-only
    pages, which made the join raise — coalesce to ""; text files are
    read as UTF-8 explicitly.
    """
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return ""
141
+
142
def generate_response(prompt: str, model: str = "openai/gpt-3.5-turbo") -> str:
    """Generate a single chat completion via the OpenRouter API.

    Returns the completion text, or an "Error: ..." string on failure.
    NOTE(review): this is the pre-1.0 openai SDK interface
    (openai.ChatCompletion); it breaks on openai>=1.0 — confirm the
    pinned SDK version.
    """
    extra_headers = {
        "HTTP-Referer": "https://your-site-url.com",
        "X-Title": "Your App Name",
    }
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=chat_messages,
            headers=extra_headers,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
156
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
def process_pdf_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return its extracted text.

    Fixes over original: request timeout added; the temp PDF is always
    removed (one file per call was leaked before).
    """
    temp_path = None
    try:
        response = requests.get(pdf_url, stream=True, timeout=60)
        if response.status_code != 200:
            return f"Error: Status {response.status_code}"
        temp_path = f"temp_{uuid.uuid4()}.pdf"
        with open(temp_path, "wb") as f:
            f.write(response.content)
        return read_file_content(temp_path)
    except Exception as e:
        return f"Error: {e}"
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
170
+
171
def summarize(
    inp: str,
    history: List[Tuple[str, str]],
    report_check: bool,
    sum_check: bool,
    mem_check: bool,
    data: str = "",
    file: Optional[str] = None,
    url: str = "",
    pdf_url: str = "",
    model: str = "openai/gpt-3.5-turbo"
) -> Generator[Tuple[str, List[Tuple[str, str]], str, Dict], None, None]:
    """Collect data from every input source, summarize it, optionally memorize.

    Yields (prompt, chat_history, error_text, json_payload) for the UI.
    Fixes over original: `summary` was unbound when no data source was
    given (NameError); `memory_entries` was unbound when mem_check was
    False and IndexError-prone when save_memory returned []; the final
    payload was a json.dumps *string* fed to a gr.JSON output.
    """
    history = [(inp, "Processing...")]
    yield "", history, "", {}

    processed_data = ""
    if pdf_url and pdf_url.startswith("http"):
        processed_data += process_pdf_url(pdf_url)
    if url and url.startswith("http"):
        success, content = fetch_url_content(url)
        processed_data += content if success else "Error processing URL"
    if file:
        processed_data += f"\nFile: {file}\n{read_file_content(file)}"
    if data:
        processed_data += data

    memory_entries: List[Dict] = []  # fix: was unbound when mem_check is False
    if processed_data:
        prompt = f"Summarize this data: {processed_data[:1000]}..."
        summary = generate_response(prompt, model=model)

        if mem_check:
            memory_entries = save_memory(inp, processed_data)
            # fix: guard against [] / entries without a 'keywords' key
            if memory_entries and memory_entries[0].get('keywords'):
                keywords = ", ".join(memory_entries[0]['keywords'][:5])
                summary += f"\n\nSaved to memory with keywords: {keywords}"
    else:
        summary = "Provide a valid data source"  # fix: `summary` was unbound here

    history = [(inp, summary)]
    yield "", history, "", (memory_entries[0] if memory_entries else {})
 
 
 
 
 
 
 
 
 
 
209
 
210
def create_app():
    """Build and return the Gradio Blocks UI."""
    with gr.Blocks() as app:
        gr.Markdown("## Mixtral 8x7B Summarizer")

        # Prompt box plus output-mode toggles.
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(label="Instruction")
            with gr.Column(scale=1):
                report_check = gr.Checkbox(label="Return report", value=True)
                sum_check = gr.Checkbox(label="Summarize", value=True)
                mem_check = gr.Checkbox(label="Memory", value=True)
                submit_btn = gr.Button("Submit")

        # One tab per input source.
        with gr.Row():
            with gr.Tab("Text"):
                data = gr.Textbox(label="Input text")
            with gr.Tab("File"):
                file = gr.File(label="Upload file")
            with gr.Tab("URL"):
                url = gr.Textbox(label="Website URL")
            with gr.Tab("PDF"):
                pdf_url = gr.Textbox(label="PDF URL")

        chatbot = gr.Chatbot()
        error_box = gr.Textbox()
        json_output = gr.JSON()

        # Wire the submit button to the summarizer generator.
        submit_btn.click(
            summarize,
            [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url],
            [prompt, chatbot, error_box, json_output],
        )

    return app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
if __name__ == "__main__":
    # Build the UI and start serving.
    create_app().launch()