DeepakJaiz committed on
Commit
d5ced72
·
1 Parent(s): 1f6a2bc

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +202 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import paperqa
3
+ import pickle
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ import requests
7
+ import zipfile
8
+ import io
9
+ import tempfile
10
+ import os
11
+
12
+
13
+ css_style = """
14
+ .gradio-container {
15
+ font-family: "IBM Plex Mono";
16
+ }
17
+ """
18
+
19
+
20
def request_pathname(files, data, openai_api_key):
    """Register newly uploaded files in the document table.

    Args:
        files: Uploaded file objects, each exposing a ``.name`` path, or
            ``None`` when nothing is uploaded.
        data: List of ``[filepath, citation, key]`` rows; new rows are
            appended in place.
        openai_api_key: Current API key text, forwarded to
            ``validate_dataset`` to compute the status message.

    Returns:
        A 4-tuple ``(stats, data, data, status)`` matching the four outputs
        this handler is wired to. ``stats`` is ``[[number_of_rows, 0]]``.
    """
    if files is not None:
        # Build the lookup once instead of rebuilding a list per file.
        known_paths = {row[0] for row in data}
        for file in files:
            # make sure we're not duplicating things in the dataset
            if file.name in known_paths:
                continue
            data.append([file.name, None, None])
            known_paths.add(file.name)

    # validate_dataset inspects the last row, so hand it a blank placeholder
    # row when no documents are registered yet.
    status_rows = data if data else [["", None, None]]
    status = validate_dataset(pd.DataFrame(status_rows), openai_api_key)
    return [[len(data), 0]], data, data, status
29
+
30
+
31
def validate_dataset(dataset, openapi):
    """Return the status message for the current documents and API key.

    Args:
        dataset: pandas DataFrame of documents. Documents count as present
            when the frame is non-empty and the first column of its last row
            is not the empty string.
        openapi: The API key; counts as present when it is a non-empty
            string.

    Returns:
        One of four status strings: ready, waiting for key, waiting for
        documents, or waiting for both.
    """
    # Guard against an empty frame: iloc[-1, 0] would raise IndexError.
    docs_ready = bool(not dataset.empty and dataset.iloc[-1, 0] != "")
    key_ready = isinstance(openapi, str) and len(openapi) > 0

    if docs_ready and key_ready:
        return "✨Ready✨"
    if docs_ready:
        return "⚠️Waiting for key⚠️"
    if key_ready:
        return "⚠️Waiting for documents⚠️"
    return "⚠️Waiting for documents and key⚠️"
41
+
42
+
43
def make_stats(docs):
    """Build the single-row table shown in the "Doc Stats" dataframe.

    Args:
        docs: Object exposing ``doc_previews``, a sized iterable whose items
            carry a number at index 0.

    Returns:
        ``[[count, total]]`` where ``count`` is the number of previews and
        ``total`` is the sum of each preview's first element (displayed under
        the 'Chunks' header).
    """
    previews = docs.doc_previews
    return [[len(previews), sum(preview[0] for preview in previews)]]
45
+
46
+
47
+ # , progress=gr.Progress()):
48
def do_ask(question, button, openapi, dataset, length, do_marg, k, max_sources, docs):
    """Index the listed documents and stream an answer to ``question``.

    This is a generator; every yielded item is a 5-tuple
    ``(answer, context, passages, docs, stats)`` matching the outputs the
    "Ask Question" button is wired to.

    Args:
        question: The question text.
        button: Current status text; must equal the ready message to proceed.
        openapi: OpenAI API key; stripped and exported as ``OPENAI_API_KEY``.
        dataset: pandas DataFrame with 'filepath', 'citation string' and
            'key' columns.
        length: Requested answer length in words.
        do_marg: Passed to ``query_gen`` as ``marginal_relevance``.
        k: Passed to ``query_gen`` as ``k``.
        max_sources: Passed to ``query_gen`` as ``max_sources``.
        docs: Existing ``paperqa.Docs`` collection, or ``None`` to create one.

    Yields:
        Blank placeholders while documents are being added, then the
        intermediate and final results of the query.
    """
    import logging

    # Guard against an empty frame: iloc[-1, 0] would raise IndexError.
    docs_ready = bool(not dataset.empty and dataset.iloc[-1, 0] != "")
    key_ready = isinstance(openapi, str) and len(openapi) > 0
    if not (button == "✨Ready✨" and key_ready and docs_ready):
        yield "", "", "", docs, [[0, 0]]
        # Stop here: without a key and documents there is nothing to query,
        # and ``docs`` may still be None.
        return

    os.environ['OPENAI_API_KEY'] = openapi.strip()
    if docs is None:
        docs = paperqa.Docs()

    # dataset is pandas dataframe
    for _, row in dataset.iterrows():
        try:
            docs.add(row['filepath'], row['citation string'],
                     key=row['key'], disable_check=True)
        except Exception as exc:
            # Best effort: a document that cannot be added is skipped so the
            # remaining ones are still indexed, but the reason is recorded.
            logging.getLogger(__name__).debug(
                "Skipping %s: %s", row['filepath'], exc)
            continue
        yield "", "", "", docs, make_stats(docs)

    docs._build_faiss_index()

    passages = ""
    result = None
    answers = docs.query_gen(question,
                             length_prompt=f'use {int(length):d} words',
                             marginal_relevance=do_marg,
                             k=k, max_sources=max_sources)
    for result in answers:
        yield result.formatted_answer, result.context, passages, docs, make_stats(docs)

    if result is None:
        # The query produced nothing, so there is no final answer to format.
        return

    # Passage text is intentionally not shown; one placeholder per passage.
    for _ in result.passages:
        passages += 'Disabled for now'
    yield result.formatted_answer, result.context, passages, docs, make_stats(docs)
79
+
80
+
81
def download_repo(gh_repo, data, openai_api_key, pbar=gr.Progress()):
    """Download a GitHub repository zipball and register its text files.

    This is a generator; after each file is added it yields the 4-tuple
    ``(stats, data, data, status)`` matching the outputs the "Download Repo"
    button is wired to.

    Args:
        gh_repo: Repository in ``owner/name`` form.
        data: List of ``[filepath, citation, key]`` rows; new rows are
            appended in place.
        openai_api_key: Current API key text, forwarded to
            ``validate_dataset`` to compute the status message.
        pbar: Gradio progress tracker.

    Raises:
        ValueError: If GitHub does not answer the zipball request with
            HTTP 200.

    Returns:
        ``data`` (as the generator's return value).
    """
    gh_repo = gh_repo.strip()
    # download zipped version of repo
    response = requests.get(
        f'https://api.github.com/repos/{gh_repo}/zipball', timeout=60)
    if response.status_code != 200:
        raise ValueError('Unknown Github Repo')
    pbar(0.0, 'Downloaded')

    # Each archive file gets a brand-new temp path, so duplicates are
    # detected by citation (unique per repo path) instead of by path.
    known_citations = {row[1] for row in data}

    # iterate through files in zip
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        names = archive.namelist()
        total = len(names)
        for index, name in enumerate(names):
            # skip directories
            if name.endswith('/'):
                continue
            # try to read as plaintext (skip binary files)
            try:
                text = archive.read(name).decode('utf-8')
            except UnicodeDecodeError:
                continue
            # skip text longer than 1e5 characters or shorter than 10
            if len(text) > 1e5 or len(text) < 10:
                continue
            # strip off the first directory of the archive path
            rel_path = '/'.join(name.split('/')[1:])
            key = os.path.basename(name)
            citation = f'[{rel_path}](https://github.com/{gh_repo}/tree/main/{rel_path})'
            # Check before writing so skipped files leave no temp file behind.
            if citation in known_citations:
                continue
            # have to save to temporary file so we have a path
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                tmp.write(text.encode('utf-8'))
                tmp.flush()
                path = tmp.name
            data.append([path, citation, key])
            known_citations.add(citation)
            yield [[len(data), 0]], data, data, validate_dataset(pd.DataFrame(data), openai_api_key)
            # Progress is reported as a fraction between 0 and 1.
            pbar((index + 1) / total, f'Added {name}')
    pbar(1.0, 'Done')
    return data
120
+
121
+
122
# Gradio UI: API key entry, document sources (upload or GitHub repo), the
# document table with status and stats, and the question/answer widgets.
with gr.Blocks(css=css_style) as demo:

    # Per-session state: the paperqa collection and the document rows.
    docs = gr.State(None)
    data = gr.State([])

    # The Markdown text is kept flush-left so its rendering does not depend
    # on any indentation handling.
    gr.Markdown(f"""
# Document Question and Answer (v{paperqa.__version__})
*By Andrew White ([@andrewwhite01](https://twitter.com/andrewwhite01))*
This tool will enable asking questions of your uploaded text, PDF documents,
or scrape github repos.
It uses OpenAI's GPT models and thus you must enter your API key below. This
tool is under active development and currently uses many tokens - up to 10,000
for a single query. That is $0.10-0.20 per query, so please be careful!
* [PaperQA](https://github.com/whitead/paper-qa) is the code used to build this tool.
* [langchain](https://github.com/hwchase17/langchain) is the main library this tool utilizes.
1. Enter API Key ([What is that?](https://platform.openai.com/account/api-keys))
2. Upload your documents
3. Ask a question
""")
    openai_api_key = gr.Textbox(
        label="OpenAI API Key", placeholder="sk-...", type="password")
    with gr.Tab('File Upload'):
        uploaded_files = gr.File(
            label="Your Documents Upload (PDF or txt)", file_count="multiple", )
    with gr.Tab('Github Repo'):
        gh_repo = gr.Textbox(
            label="Github Repo", placeholder="whitead/paper-qa")
        download = gr.Button("Download Repo")

    with gr.Accordion("See Docs:", open=False):
        dataset = gr.Dataframe(
            headers=["filepath", "citation string", "key"],
            datatype=["str", "str", "str"],
            col_count=(3, "fixed"),
            interactive=False,
            label="Documents and Citations",
            overflow_row_behaviour='paginate',
            max_rows=5
        )
    buildb = gr.Textbox("⚠️Waiting for documents and key...",
                        label="Status", interactive=False, show_label=True,
                        max_lines=1)
    stats = gr.Dataframe(headers=['Docs', 'Chunks'],
                         datatype=['number', 'number'],
                         col_count=(2, "fixed"),
                         interactive=False,
                         label="Doc Stats")

    # Recompute the status whenever the key or the document table changes.
    openai_api_key.change(validate_dataset, inputs=[
        dataset, openai_api_key], outputs=[buildb])
    dataset.change(validate_dataset, inputs=[
        dataset, openai_api_key], outputs=[buildb])

    # Both document sources feed the same four outputs.
    uploaded_files.change(request_pathname, inputs=[
        uploaded_files, data, openai_api_key], outputs=[stats, data, dataset, buildb])
    download.click(fn=download_repo, inputs=[
        gh_repo, data, openai_api_key], outputs=[stats, data, dataset, buildb])

    query = gr.Textbox(
        placeholder="Enter your question here...", label="Question")
    with gr.Row():
        length = gr.Slider(25, 200, value=100, step=5,
                           label='Words in answer')
        marg = gr.Checkbox(True, label='Max marginal relevance')
        k = gr.Slider(1, 20, value=10, step=1,
                      label='Chunks to examine')
        sources = gr.Slider(1, 10, value=5, step=1,
                            label='Contexts to include')

    ask = gr.Button("Ask Question")
    answer = gr.Markdown(label="Answer")
    with gr.Accordion("Context", open=True):
        context = gr.Markdown(label="Context")

    with gr.Accordion("Raw Text", open=False):
        passages = gr.Markdown(label="Passages")
    ask.click(fn=do_ask, inputs=[query, buildb,
                                 openai_api_key, dataset,
                                 length, marg, k, sources,
                                 docs], outputs=[answer, context, passages, docs, stats])

demo.queue(concurrency_count=20)
demo.launch(show_error=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ paper-qa>=0.0.21
2
+ gradio
3
+ requests
4
+ transformers