LPX55 committed on
Commit
20235b3
·
verified ·
1 Parent(s): b79fd97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +218 -752
app.py CHANGED
@@ -1,781 +1,247 @@
1
- import gradio as gr
2
- #import urllib.request
3
- import requests
4
- import bs4
5
- import lxml
6
  import os
7
- #import subprocess
8
- from huggingface_hub import InferenceClient,HfApi
9
  import random
10
- import json
11
- import datetime
12
- from pypdf import PdfReader
13
  import uuid
14
- #from query import tasks
15
- from agent import (
16
- PREFIX,
17
- COMPRESS_DATA_PROMPT,
18
- COMPRESS_DATA_PROMPT_SMALL,
19
- LOG_PROMPT,
20
- LOG_RESPONSE,
21
- )
22
- client = InferenceClient(
23
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
24
- )
25
- reponame="LPX55/ArxivPapers"
26
- save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
27
- token_self = os.environ['HF_TOKEN']
28
- api=HfApi(token=token_self)
29
-
30
def find_all(url):
    """Fetch a URL and return (success, scraped-text-plus-link-fragments).

    Returns (True, payload) on HTTP 200, otherwise (False, message).
    Fixes over original: request timeout added (a dead host hung the UI
    forever), and the dead separator-counting loop removed.
    """
    print(f"trying URL:: {url}")
    try:
        if not url:
            print('passing')
            return False, "Enter Valid URL"
        source = requests.get(url, timeout=30)
        print(source.status_code)
        if source.status_code != 200:
            return False, f'Status:: {source.status_code}'
        print('trying')
        soup = bs4.BeautifulSoup(source.content, 'lxml')
        rawp = f'RAW TEXT RETURNED: {soup.text}'
        print(rawp)
        out = [rawp, "HTML fragments: "]
        # Collect every anchor with its title / href / display text.
        for a in soup.find_all("a"):
            out.append([{"LINK TITLE": a.get('title'),
                         "URL": a.get('href'),
                         "STRING": a.string}])
        return True, str(out)
    except Exception as e:
        print(e)
        return False, f'Error: {e}'
75
-
76
-
77
def read_txt(txt_path):
    """Return the full contents of a text file.

    Fixes over original: drops the redundant explicit f.close() inside
    the `with` block and pins the encoding to UTF-8 so behavior does not
    depend on the platform default.
    """
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(text)
    return text
84
-
85
def read_pdf(pdf_path):
    """Extract and return the text of every page of a local PDF.

    Fixes over original: uses join instead of quadratic string
    concatenation, and coalesces extract_text() to "" — it can return
    None for image-only pages, which broke the f-string accumulator.
    """
    reader = PdfReader(pdf_path)
    text = "".join(f"\n{page.extract_text() or ''}" for page in reader.pages)
    print(text)
    return text
94
-
95
error_box = []  # URLs that failed to download; read by the UI layer.

def read_pdf_online(url):
    """Download a PDF by URL and return its extracted text.

    Contract preserved from the original: a non-200 status is appended
    to `error_box` and the status code itself returned; an exception is
    returned as the exception object.
    Fixes: unique temp filename (the fixed "test.pdf" raced under the
    concurrent queue), request timeout, temp file removed afterwards.
    """
    uid = uuid.uuid4()
    print(f"reading {url}")
    try:
        response = requests.get(url, stream=True, timeout=60)
        print(response.status_code)
        if response.status_code != 200:
            error_box.append(url)
            print(response.status_code)
            return response.status_code
        tmp = f"dl-{uid}.pdf"
        with open(tmp, "wb") as f:
            f.write(response.content)
        reader = PdfReader(tmp)
        print(len(reader.pages))
        text = ""
        for page in reader.pages:
            text = f"{text}\n{page.extract_text() or ''}"
        os.remove(tmp)  # fix: original left test.pdf behind
        print(f"PDF_TEXT:: {text}")
        return text
    except Exception as e:
        print(e)
        return e
130
-
131
-
132
VERBOSE = True
MAX_HISTORY = 100
MAX_DATA = 20000

def format_prompt(message, history):
    """Render chat history plus the new message in Mixtral [INST] format."""
    pieces = ["<s>"]
    for user_turn, bot_turn in history:
        pieces.append(f"[INST] {user_turn} [/INST]")
        pieces.append(f" {bot_turn}</s> ")
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
143
-
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    """Stream a completion from the Mixtral client, return the full text."""
    print(seed)
    timestamp = datetime.datetime.now()

    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )

    # Agent PREFIX first, then the task-specific template.
    content = PREFIX.format(
        timestamp=timestamp,
        purpose="Compile the provided data and complete the users task",
    ) + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))

    stream = client.text_generation(
        content, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    resp = "".join(token.token.text for token in stream)

    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp
185
-
186
 
187
def compress_data(c, instruct, history):
    """Summarize `history` in MAX_DATA-sized chunks.

    c is the approximate separator count of `history`; returns the list
    of per-chunk summaries.
    Fix over original: c == 0 caused ZeroDivisionError in the chunk math.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    c = max(int(c), 1)  # guard: original divided by zero when c == 0
    divr = c / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(c / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out = []
    s = 0
    e = chunk
    print(f'e:: {e}')
    for _ in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=hist,
        )
        out.append(resp)
        print(resp)
        e += chunk
        s += chunk
    return out
227
 
 
 
 
 
228
 
229
def compress_data_og(c, instruct, history):
    """Iteratively compress `history`, feeding each summary back as context.

    Returns the final summary string.
    Fixes over original: removed the dead `out += resp` (it extended a
    list character-by-character and was never used), and guarded the
    c == 0 ZeroDivisionError in the chunk math.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    c = max(int(c), 1)  # guard: original divided by zero when c == 0
    divr = c / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(c / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    s = 0
    e = chunk
    print(f'e:: {e}')
    new_history = ""
    resp = ""
    for _ in range(divi):
        print(f's:e :: {s}:{e}')
        hist = history[s:e]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=hist,
        )
        # Each pass sees the previous summary as accumulated knowledge.
        new_history = resp
        print(resp)
        e += chunk
        s += chunk
    print("final" + resp)
    return resp
281
-
282
-
283
RECALL_MEMORY="""The user will give you a query and a list
Your duty is to choose the words from the list that are closely related to the search query.
If there are no relevant keywords found in the provided list return 'NONE'
Respond with only a list, or NONE
Respond only in this format:
[keyword1,keyword2,keyword3]

USER QUERY:
{prompt}

KEYWORD LIST:
{keywords}
"""


def get_mem(prompt, kw):
    """Ask the model which keywords from `kw` are relevant to `prompt`."""
    seed = random.randint(1, 1000000000)
    generate_kwargs = dict(
        temperature=0.6,
        max_new_tokens=1024,
        top_p=0.6,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    content = RECALL_MEMORY.format(keywords=kw, prompt=prompt)
    stream = client.text_generation(
        content, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    resp = "".join(piece.token.text for piece in stream)
    print(resp)
    return resp
320
-
321
 
322
def summarize(inp, history, report_check, sum_check, mem_check,
              data=None, files=None, url=None, pdf_url=None, pdf_batch=None):
    """Gather data from every input tab, then summarize and/or memorize it.

    Yields (prompt, chat_history, error_box, json_output) for the UI.
    Fixes over original: `json_out` and `rawp` could be referenced while
    unbound on several paths (NameError); `.startswith` on a None
    textbox value crashed; data=None leaked the string "None" into the
    corpus.
    """
    json_out = []  # fix: was unbound when no valid data source was given
    rawp = ""      # fix: was unbound when sum_check and mem_check were both off
    if inp == "":
        inp = "Process this data"
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_out

    data = data or ""
    # --- batch of comma-separated PDF URLs -------------------------------
    if pdf_batch and pdf_batch.startswith("http"):
        for batch_url in pdf_batch.split(","):
            batch_url = batch_url.strip()
            if not batch_url:
                continue
            try:
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
            except Exception as e:
                print(e)
    # --- single PDF URL --------------------------------------------------
    if pdf_url and pdf_url.startswith("http"):
        print("PDF_URL")
        data = read_pdf_online(pdf_url)
    # --- raw HTML URL ----------------------------------------------------
    if url and url.startswith("http"):
        val, out = find_all(url)
        if not val:
            data = "Error"
            rawp = str(out)
        else:
            data = out
    # --- uploaded files --------------------------------------------------
    if files:
        for file in files:
            try:
                print(file)
                if file.endswith(".pdf"):
                    data = f'{data}\nFile Name ({file}):\n{read_pdf(file)}'
                elif file.endswith(".txt"):
                    data = f'{data}\nFile Name ({file}):\n{read_txt(file)}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    if data not in ("Error", ""):
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        # Crude size estimate: count whitespace/comma/newline boundaries.
        c = 1 + sum(1 for ch in out if ch in " ,\n")
        print(f'c:: {c}')
        if mem_check:
            json_out = save_memory(inp, out)
            rawp = "Complete"
        if sum_check:
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                c2 = 1 + sum(1 for ch in out if ch in " ,\n")
                print(f'c2:: {c2}')
                rawp = compress_data_og(c2, inp, out)
            else:
                rawp = out
            json_out = format_json(json_out)
    else:
        rawp = "Provide a valid data source"
    history = [(inp, rawp)]
    yield "", history, error_box, json_out
406
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"],
"title":"title of entry",
"description":"A sentence summarizing the topic of this entry",
"content":"A brief paragraph summarizing the important datapoints found in this entry",
"url":"https://url.source"
"""


def format_json(inp):
    """Strip markdown fencing from a model response and parse it as a literal.

    Accepts either a list of lines or a single string (the original
    iterated a string character-by-character, which mangled the input).
    Fix: parses with ast.literal_eval instead of eval() — eval on model
    output is an arbitrary-code-execution risk.
    """
    import ast

    print("FORMATTING:::")
    print(type(inp))
    print("###########")
    print(inp)
    print("###########")
    lines = inp.splitlines() if isinstance(inp, str) else inp
    new_str = ""
    for line in lines:
        line = line.strip()
        print(line)
        new_str += line.strip("\n").strip("```").strip("#").strip("//")
    new_str = new_str.strip("</s>")
    out_json = ast.literal_eval(new_str)
    print(out_json)
    return out_json
450
-
451
-
452
-
453
-
454
def format_json_og(inp):
    """Parse key:value lines emitted by the model into a list of dicts.

    A '{' line starts a new record, recognized fields are captured, and
    a '}' line commits the record.
    """
    parsed = []
    current = {}
    print("FORMATTING:::")
    for line in inp:
        line = line.strip()
        if "{" in line:
            print(line)
            current = {}
        if ":" in line:
            # Same recognition order as before: later fields overwrite
            # nothing, they simply add keys in this sequence.
            for field in ("keywords", "title", "description", "content", "url"):
                if field in line:
                    current[field] = line.split(":")[1].strip(",")
                    print(line)
        if "}" in line:
            parsed.append(current)
            print(parsed)
    return parsed
484
-
485
def create_index():
    """Rebuild the keyword -> [file_name] index from main.json and upload it.

    Fixes over original: the `main` fallback tested r.status_code (the
    *index* request) instead of m.status_code, so `main` could be
    unbound; the exception path called .append() on a dict; requests now
    have timeouts; the temp file is removed after upload.
    """
    uid = uuid.uuid4()

    # ---- load existing index -------------------------------------------
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        ind = json.loads(r.text)
        print(f'ind::\n{ind}')
    else:
        print("Create new IND")
        ind = [{}]

    # ---- load main memory file -----------------------------------------
    m = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
    print(f'status code main:: {m.status_code}')
    if m.status_code == 200:
        main = json.loads(m.text)
    else:
        main = []

    # ---- fold every entry's keywords into the index ---------------------
    for ea in main:
        try:
            for k in ea['keywords']:
                print(k)
                print(ea['file_name'])
                if k in ind[0]:
                    print("Adding to list")
                    if ea['file_name'] not in ind[0][k]:
                        ind[0][k].append(ea['file_name'])
                else:
                    print("Adding new Value")
                    ind[0][k] = [ea['file_name']]
        except Exception as e:
            print(e)

    # ---- persist and upload --------------------------------------------
    json_object = json.dumps(ind, indent=4)
    tmp_path = f"tmp3-{uid}.json"
    with open(tmp_path, "w") as outfile3:
        outfile3.write(json_object)
    try:
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo="/mem-test2/index.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
    finally:
        os.remove(tmp_path)  # fix: original leaked one temp file per call
547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
 
549
 
550
def save_memory(purpose, history):
    """Index `history` into chunked JSON memory files on the dataset repo.

    For each MAX_DATA-sized chunk: ask the model for a JSON index entry,
    upload it as its own file, fold its keywords/description into
    main.json, then rebuild the keyword index. Returns the list of raw
    model responses (one per chunk).
    Fixes over original: removed the dead `json_object['keywords']`
    lookup (json_object was a string, so it always raised); guarded the
    c == 0 ZeroDivisionError; request timeouts added.
    """
    uid = uuid.uuid4()
    inp = str(history)
    print(f'rl:: {len(inp)}')
    c = 1 + sum(1 for ch in inp if ch in " ,\n/\\.<")
    print(f'c:: {c}')

    seed = random.randint(1, 1000000000)
    print(c)
    c = max(int(c), 1)  # guard: original divided by zero when c == 0
    divr = c / MAX_DATA
    divi = int(divr) + 1 if divr != int(divr) else int(divr)
    chunk = int(c / divr)
    print(f'chunk:: {chunk}')
    print(f'divr:: {divr}')
    print(f'divi:: {divi}')
    out_box = []
    s = 0
    ee = chunk
    print(f'e:: {ee}')
    task = 'Index this Data\n'
    for _ in range(divi):
        print(f's:e :: {s}:{ee}')
        hist = inp[s:ee]
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=hist,
        ).strip('\n')
        # Trim the response down to the '[{...' JSON payload, if present.
        try:
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]
        except Exception as e:
            print(e)
        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        with open(f"tmp-{uid}.json", "w") as outfile:
            outfile.write(resp)
        api.upload_file(
            path_or_fileobj=f"tmp-{uid}.json",
            path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        lines = resp.strip().strip("\n").split("\n")

        # ---- fold this chunk's keywords/description into main.json ------
        r = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
        print(f'status code main:: {r.status_code}')
        lod = json.loads(r.text) if r.status_code == 200 else []
        print(f'lod:: {lod}')
        key_box = []
        desc = ""
        for line in lines:
            if ":" not in line:
                continue
            print(f'line:: {line}')
            if "keywords" in line:
                print(f'trying:: {line}')
                keyw = line.split(":")[1]
                print(keyw)
                try:
                    keyw = keyw.split("[")[1].split("]")[0]
                    for ea in keyw.split(","):
                        # Keep only alphanumerics and spaces.
                        s1 = "".join(ch for ch in ea.strip().strip("\n")
                                     if ch.isalnum() or ch == " ")
                        print(s1)
                        key_box.append(s1)
                except Exception as e:
                    print(f'ERROR SAVING KEYWORD:: {e}')
            if "description" in line:
                desc = line.split(":")[1]
            if key_box and desc:
                lod.append({"file_name": f"{timename}---{s}-{ee}",
                            "keywords": key_box,
                            "description": str(desc),
                            "index": f"{s}:{ee}"})
                key_box = []
                desc = ""
        with open(f"tmp2-{uid}.json", "w") as outfile2:
            outfile2.write(json.dumps(lod, indent=4))
        api.upload_file(
            path_or_fileobj=f"tmp2-{uid}.json",
            path_in_repo="/mem-test2/main.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        ee += chunk
        s += chunk
        out_box.append(resp)
    create_index()
    return out_box
693
-
694
def valid_list(inp):
    """Parse a "[a,b,c]"-style model response into a Python list.

    BUG FIX: the original built `out_list` but never returned it, so
    every caller received None. Non-string input yields [].
    """
    out_list = []
    print(type(inp))
    if isinstance(inp, str):
        print("STRING")
        new_list = inp.split("[")[1].split("]", -1)[0]
        print(new_list)
        print(type(new_list))
        for ea in new_list.split(","):
            out_list.append(ea.replace("'", "").replace('"', ""))
        print(out_list)
        print(type(out_list))
    return out_list
710
-
711
-
712
def recall_memory(inp, history):
    """Look up index keywords relevant to `inp` and show them in the chat.

    BUG FIX: on a missing index the original executed
    `return out,out,out,out` inside a *generator*, which ends iteration
    without emitting anything — the UI silently showed nothing. We yield
    the error message instead.
    """
    error_box = ""
    json_out = {}
    if not history:
        history = []
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        mem = json.loads(r.text)
        print(f'ind::\n{mem}')
    else:
        print("Create new IND")
        out = "MEMORY FILE NOT FOUND"
        yield "", [(inp, out)], out, json_out
        return
    mem_keys = mem[0].keys()
    rawp = get_mem(inp, mem_keys)
    valid_list(rawp)
    history = [(inp, rawp)]
    yield "", history, error_box, json_out
734
-
735
-
736
-
737
- #################################
738
def clear_fn():
    """Reset the prompt box and chatbot to their empty state."""
    empty_chat = [(None, None)]
    return "", empty_chat
740
 
741
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot", show_copy_button=True)

    # --- prompt row: instructions plus output-mode toggles ---------------
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(label="Instructions (optional)")
        with gr.Column(scale=1):
            report_check = gr.Checkbox(label="Return Report", value=True)
            sum_check = gr.Checkbox(label="Summarize", value=True)
            mem_check = gr.Checkbox(label="Memory", value=True)
            button = gr.Button()

    with gr.Row():
        stop_button = gr.Button("Stop")
        clear_btn = gr.Button("Clear")

    # --- input tabs: one per data source ---------------------------------
    with gr.Row():
        with gr.Tab("Text"):
            data = gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file = gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
        with gr.Tab("Memory"):
            mem_inp = gr.Textbox(label="Query")
            mem = gr.Button()
    json_out = gr.JSON()
    e_box = gr.Textbox()

    # --- event wiring -----------------------------------------------------
    mem.click(recall_memory, mem_inp, [prompt, chatbot, e_box, json_out])
    clear_btn.click(clear_fn, None, [prompt, chatbot])
    go = button.click(
        summarize,
        [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url, pdf_batch],
        [prompt, chatbot, e_box, json_out],
    )
    stop_button.click(None, None, None, cancels=[go])
app.queue(default_concurrency_limit=20).launch(show_api=False)
 
1
+ import json
 
 
 
 
2
  import os
 
 
3
  import random
 
 
 
4
  import uuid
5
+ import datetime
6
+ from typing import List, Tuple, Dict, Optional, Generator, Any
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ import gradio as gr
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ from pypdf import PdfReader
12
+ import openai
13
+ from huggingface_hub import HfApi
14
+
15
# Configuration
# NOTE(review): both keys fall back to "" when the env var is absent —
# confirm that downstream calls fail loudly enough without credentials.
OPENAI_API_BASE = "https://openrouter.ai/api/v1"
OPENAI_API_KEY = os.environ.get("OR_KEY", "")
REPO_NAME = "LPX55/ArxivPapers"
SAVE_DATA_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
api = HfApi(token=HF_TOKEN)

# Initialize OpenAI client — pointed at OpenRouter, not api.openai.com.
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY


# Indexing Constants
INDEX_PROMPT = """Compile this data into a structured JSON format with these keys:
- "keywords": List of important keywords
- "title": Descriptive title
- "description": Brief summary
- "content": Main content
- "url": Source URL if available
"""
36
 
37
def create_index() -> None:
    """Create or update the keyword search index from memory files.

    Fixes over original: request timeouts, tolerance for a 200 response
    whose body is not valid JSON, and removal of the temp file that was
    leaked once per call.
    """
    uid = uuid.uuid4()

    # Load existing index (fresh single-dict list when absent/invalid).
    index_url = f"{SAVE_DATA_URL}mem-test2/index.json"
    r = requests.get(index_url, timeout=30)
    try:
        index_data = json.loads(r.text) if r.status_code == 200 else [{}]
    except json.JSONDecodeError:
        index_data = [{}]

    # Load main memory data.
    main_url = f"{SAVE_DATA_URL}mem-test2/main.json"
    m = requests.get(main_url, timeout=30)
    try:
        main_data = json.loads(m.text) if m.status_code == 200 else []
    except json.JSONDecodeError:
        main_data = []

    # Update index: keyword -> list of file names containing it.
    for entry in main_data:
        try:
            for keyword in entry.get('keywords', []):
                if keyword in index_data[0]:
                    if entry['file_name'] not in index_data[0][keyword]:
                        index_data[0][keyword].append(entry['file_name'])
                else:
                    index_data[0][keyword] = [entry['file_name']]
        except Exception as e:
            print(f"Indexing error: {e}")

    # Save updated index, upload, then clean up the temp file.
    index_path = f"tmp-index-{uid}.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f)
    try:
        api.upload_file(
            path_or_fileobj=index_path,
            path_in_repo="/mem-test2/index.json",
            repo_id=REPO_NAME,
            repo_type="dataset",
        )
    finally:
        os.remove(index_path)  # fix: original leaked one temp file per call
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
def save_memory(purpose: str, content: str) -> List[Dict]:
    """Save processed content to memory with indexing.

    Returns a one-element list containing the saved entry, or [] when
    the model response could not be parsed.
    Fixes over original: a JSON payload that is not an object crashed
    the ** unpack; request timeout added; temp file removed after upload.
    """
    uid = uuid.uuid4()
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")

    # Generate structured data (content truncated to respect API limits).
    prompt = f"{INDEX_PROMPT}\nData to index:\n{content[:5000]}"
    try:
        response = generate_response(prompt, model="anthropic/claude-2")
        structured_data = json.loads(response)
        if not isinstance(structured_data, dict):
            # fix: a non-dict payload (e.g. a JSON list) broke ** below
            raise ValueError(f"expected JSON object, got {type(structured_data).__name__}")
    except Exception as e:
        print(f"Memory processing error: {e}")
        return []

    memory_entry = {
        **structured_data,
        "file_name": f"{timestamp}--{uid}.json",
        "timestamp": str(datetime.datetime.now()),
    }

    # Append the entry to the main memory file and re-upload it.
    main_url = f"{SAVE_DATA_URL}mem-test2/main.json"
    m = requests.get(main_url, timeout=30)
    main_data = json.loads(m.text) if m.status_code == 200 else []
    main_data.append(memory_entry)

    main_path = f"tmp-main-{uid}.json"
    with open(main_path, "w") as f:
        json.dump(main_data, f)
    try:
        api.upload_file(
            path_or_fileobj=main_path,
            path_in_repo="/mem-test2/main.json",
            repo_id=REPO_NAME,
            repo_type="dataset",
        )
    finally:
        os.remove(main_path)  # fix: original leaked one temp file per call

    # Rebuild the keyword index over the updated memory.
    create_index()

    return [memory_entry]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
def fetch_url_content(url: str) -> Tuple[bool, str]:
    """Fetch a URL and return (success, HTML-or-error-message).

    Fix over original: request timeout added so a dead host cannot hang
    the caller indefinitely.
    """
    try:
        if not url:
            return False, "Enter valid URL"

        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")
            return True, str(soup)
        return False, f"Status: {response.status_code}"
    except Exception as e:
        return False, f"Error: {e}"
131
+
132
def read_file_content(file_path: str) -> str:
    """Read text content from a .txt or .pdf file; "" for any other type.

    Fixes over original: extract_text() may return None for image-only
    pages, which made the join raise — coalesce to ""; text files are
    read as UTF-8 explicitly.
    """
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return ""
141
+
142
def generate_response(prompt: str, model: str = "openai/gpt-3.5-turbo") -> str:
    """Generate a single chat completion via the OpenRouter API.

    Returns the completion text, or an "Error: ..." string on failure.
    NOTE(review): this is the pre-1.0 openai SDK interface
    (openai.ChatCompletion); it breaks on openai>=1.0 — confirm the
    pinned SDK version.
    """
    extra_headers = {
        "HTTP-Referer": "https://your-site-url.com",
        "X-Title": "Your App Name",
    }
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=chat_messages,
            headers=extra_headers,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
156
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
def process_pdf_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return its extracted text.

    Fixes over original: request timeout added; the temp PDF is always
    removed (one file per call was leaked before).
    """
    temp_path = None
    try:
        response = requests.get(pdf_url, stream=True, timeout=60)
        if response.status_code != 200:
            return f"Error: Status {response.status_code}"
        temp_path = f"temp_{uuid.uuid4()}.pdf"
        with open(temp_path, "wb") as f:
            f.write(response.content)
        return read_file_content(temp_path)
    except Exception as e:
        return f"Error: {e}"
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
170
+
171
def summarize(
    inp: str,
    history: List[Tuple[str, str]],
    report_check: bool,
    sum_check: bool,
    mem_check: bool,
    data: str = "",
    file: Optional[str] = None,
    url: str = "",
    pdf_url: str = "",
    model: str = "openai/gpt-3.5-turbo"
) -> Generator[Tuple[str, List[Tuple[str, str]], str, Dict], None, None]:
    """Collect data from every input source, summarize it, optionally memorize.

    Yields (prompt, chat_history, error_text, json_payload) for the UI.
    Fixes over original: `summary` was unbound when no data source was
    given (NameError); `memory_entries` was unbound when mem_check was
    False and IndexError-prone when save_memory returned []; the final
    payload was a json.dumps *string* fed to a gr.JSON output.
    """
    history = [(inp, "Processing...")]
    yield "", history, "", {}

    processed_data = ""
    if pdf_url and pdf_url.startswith("http"):
        processed_data += process_pdf_url(pdf_url)
    if url and url.startswith("http"):
        success, content = fetch_url_content(url)
        processed_data += content if success else "Error processing URL"
    if file:
        processed_data += f"\nFile: {file}\n{read_file_content(file)}"
    if data:
        processed_data += data

    memory_entries: List[Dict] = []  # fix: was unbound when mem_check is False
    if processed_data:
        prompt = f"Summarize this data: {processed_data[:1000]}..."
        summary = generate_response(prompt, model=model)

        if mem_check:
            memory_entries = save_memory(inp, processed_data)
            # fix: guard against [] / entries without a 'keywords' key
            if memory_entries and memory_entries[0].get('keywords'):
                keywords = ", ".join(memory_entries[0]['keywords'][:5])
                summary += f"\n\nSaved to memory with keywords: {keywords}"
    else:
        summary = "Provide a valid data source"  # fix: `summary` was unbound here

    history = [(inp, summary)]
    yield "", history, "", (memory_entries[0] if memory_entries else {})
 
 
 
 
 
 
 
 
 
 
209
 
210
def create_app():
    """Build and return the Gradio Blocks UI."""
    with gr.Blocks() as app:
        gr.Markdown("## Mixtral 8x7B Summarizer")

        # Prompt box plus output-mode toggles.
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(label="Instruction")
            with gr.Column(scale=1):
                report_check = gr.Checkbox(label="Return report", value=True)
                sum_check = gr.Checkbox(label="Summarize", value=True)
                mem_check = gr.Checkbox(label="Memory", value=True)
                submit_btn = gr.Button("Submit")

        # One tab per input source.
        with gr.Row():
            with gr.Tab("Text"):
                data = gr.Textbox(label="Input text")
            with gr.Tab("File"):
                file = gr.File(label="Upload file")
            with gr.Tab("URL"):
                url = gr.Textbox(label="Website URL")
            with gr.Tab("PDF"):
                pdf_url = gr.Textbox(label="PDF URL")

        chatbot = gr.Chatbot()
        error_box = gr.Textbox()
        json_output = gr.JSON()

        # Wire the submit button to the summarizer generator.
        submit_btn.click(
            summarize,
            [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url],
            [prompt, chatbot, error_box, json_output],
        )

    return app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
if __name__ == "__main__":
    # Build the UI and start serving.
    create_app().launch()