paolosandejas-stratpoint committed on
Commit
9c78fbf
·
verified ·
1 Parent(s): 836ffb8

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +3 -9
  2. app.py +230 -0
  3. requirements.txt +175 -0
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: LLM Research Helper
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.28.3
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: LLM_Research_Helper
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.26.0
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+
3
+ import os
4
+ import re
5
+ import argparse
6
+
7
+ import requests
8
+
9
+ import google.generativeai as genai
10
+
11
+ from IPython.display import Markdown
12
+
13
+ import gradio as gr
14
+
15
+ # # Used to securely store your API key
16
+ # from google.colab import userdata
17
+
18
# Gemini credentials are read from the environment; '-1' is the placeholder
# used when GEMINI_API_KEY is unset.
# NOTE(review): with the placeholder, configuration presumably succeeds and the
# failure only surfaces on the first Gemini request — confirm.
gemini_api_key = os.environ.get('GEMINI_API_KEY', '-1')
genai.configure(api_key=gemini_api_key)


# Semantic Scholar API key; None when S2_API_KEY is unset.
S2_API_KEY = os.getenv('S2_API_KEY')
# Number of search hits requested from Semantic Scholar per query.
initial_result_limit = 10
# Intended size of the final ranked list (matches the 5 papers that
# find_basis_papers returns).
final_result_limit = 5

# Select relevant fields to pull
# NOTE(review): 'journal' appears twice in this list — presumably harmless, confirm.
fields = 'title,url,abstract,citationCount,journal,isOpenAccess,fieldsOfStudy,year,journal'
28
+
29
+
30
def raw_to_markdown(text):
    """Wrap ``text`` in an IPython ``Markdown`` object rendered as a block quote.

    Bullet characters ('•') are swapped for markdown asterisks, then every
    line (blank ones included) is prefixed with '> '.
    """
    bulleted = text.replace('•', ' *')
    # The always-true predicate makes textwrap.indent prefix empty lines as well.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _: True)
    return Markdown(quoted)
33
+
34
+
35
def markdown_to_raw(markdown_text):
    """
    This function converts basic markdown text to raw text.

    Header markers, bold/italic emphasis markers and unordered-list bullets
    are stripped while their text is kept; backtick-delimited code spans are
    removed together with their content.

    Args:
        markdown_text: The markdown text string to be converted.

    Returns:
        A string containing the raw text equivalent of the markdown text,
        with leading and trailing whitespace removed.
    """
    # Remove header markers. Anchored to the start of a line so that a '#'
    # inside running text (e.g. "C#", URL fragments) is left alone.
    text = re.sub(r'^[ \t]*#+ ?', '', markdown_text, flags=re.MULTILINE)

    # Remove bold and italics (can be adjusted based on needs)
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # Bold
    # Only treat underscores as emphasis when they are not inside a word, so
    # identifiers such as snake_case_name keep their underscores.
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)  # Italics

    # Remove code spans (content included); DOTALL lets a span cross lines.
    text = re.sub(r'`(.*?)`', '', text, flags=re.DOTALL)

    # Remove unordered-list bullets at the start of a line, keeping any
    # indentation; a mid-line "* " (e.g. "2 * 3") is not a bullet.
    text = re.sub(r'^([ \t]*)\*+ ', r'\1', text, flags=re.MULTILINE)

    # str.strip returns a new string, so the result has to be returned.
    return text.strip()
60
+
61
+
62
def find_basis_papers(query):
    """Search Semantic Scholar for ``query`` and return the best usable papers.

    Hits are restricted to papers accepted by ``isValidPaper``, ordered by
    year and then citation count (both descending), and truncated to
    ``final_result_limit`` entries.

    Args:
        query: Free-text search string sent to the paper search endpoint.

    Returns:
        A list of paper dicts (possibly empty when every hit is filtered
        out), or ``None`` when ``query`` is empty or the search reports no
        matches.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    if not query:
        print('No query given')
        return None

    # Only send the key header when a key is configured.
    headers = {'X-API-KEY': S2_API_KEY} if S2_API_KEY else {}
    rsp = requests.get(
        'https://api.semanticscholar.org/graph/v1/paper/search',
        headers=headers,
        params={'query': query, 'limit': initial_result_limit, 'fields': fields},
        # Without a timeout a stalled connection would block the chat forever.
        timeout=30,
    )
    rsp.raise_for_status()
    results = rsp.json()
    total = results.get("total")
    if not total:
        print('No matches found. Please try another query.')
        return None

    print(f'Found {total} initial results. Showing up to {initial_result_limit}.')
    papers = results.get('data') or []

    # Filter paper results
    filtered_papers = [paper for paper in papers if isValidPaper(paper)]

    # Rank paper results. Null year / citationCount values are mapped to 0 so
    # the tuple comparison never mixes None with int (which raises TypeError).
    ranked_papers = sorted(
        filtered_papers,
        key=lambda paper: (paper.get('year') or 0, paper.get('citationCount') or 0),
        reverse=True,
    )

    # Return the best papers, capped by the module-level limit.
    return ranked_papers[:final_result_limit]
97
+
98
+
99
+ # def print_papers(papers):
100
+ # for idx, paper in enumerate(papers):
101
+ # print(f"PAPER {idx}")
102
+ # for key, value in paper.items():
103
+ # if key != 'abstract':
104
+ # print(f"\t{key}: '{value}'")
105
+
106
+
107
def isValidPaper(paper):
    """Return True when ``paper`` is open access and has a non-empty abstract.

    Args:
        paper: Mapping with 'isOpenAccess' and 'abstract' keys.

    Returns:
        ``True`` if both values are truthy, otherwise ``False``.
    """
    return bool(paper['isOpenAccess'] and paper['abstract'])
112
+
113
+
114
+ # def filter_papers(papers):
115
+ # filtered_papers = []
116
+ # for paper in papers:
117
+ # if paper['isOpenAccess'] and paper['abstract']:
118
+ # # paper is acceptable
119
+ # filtered_papers.append(paper)
120
+ # return filtered_papers
121
+
122
+
123
def GEMINI_optimize_query(initial_query: str):
    """Ask a fresh Gemini chat to rewrite a query for academic-paper search.

    Args:
        initial_query: The search query to be optimized.

    Returns:
        The model's reply after it has been passed through ``markdown_to_raw``.
    """
    # A new 'gemini-pro' chat with empty history is opened on every call.
    session = genai.GenerativeModel('gemini-pro').start_chat(history=[])

    request_text = (
        "Given a search query, return an optimized version of the query to find related academic papers\n"
        f"QUERY: {initial_query}.\n"
        "Only return the optimized query"
    )

    reply = session.send_message(request_text)
    return markdown_to_raw(reply.text)
136
+
137
+
138
def GEMINI_summarize_abstracts(initial_query: str, papers: str):
    """Ask a fresh Gemini chat for a review of related literature.

    Args:
        initial_query: The search query the review should address.
        papers: The papers to review; interpolated into the prompt as text.

    Returns:
        The model's reply after it has been passed through ``markdown_to_raw``.
    """
    # A new 'gemini-pro' chat with empty history is opened on every call.
    session = genai.GenerativeModel('gemini-pro').start_chat(history=[])

    request_text = (
        "Given the following academic papers,\n"
        f"return a review of related literature for the search query: {initial_query}.\n"
        "Ignore papers without abstracts.\n"
        f"Here are the papers {papers}\n"
    )

    reply = session.send_message(request_text)
    return markdown_to_raw(reply.text)
153
+
154
+
155
def create_gemini_model():
    """Create a 'gemini-pro' model together with a chat session on it.

    Returns:
        A ``(model, chat)`` tuple; the chat starts with an empty history.
    """
    gemini = genai.GenerativeModel('gemini-pro')
    return gemini, gemini.start_chat(history=[])
161
+
162
+
163
# instantiate models
# Two independent Gemini chat sessions are created once, at import time: one
# used by predict for literature reviews and follow-up messages, one used for
# query optimization.
# NOTE(review): being module-level, both chats (and whatever history they
# accumulate) look shared by every caller of predict — confirm this is intended.
summarizer_model, summarizer_chat = create_gemini_model()
query_optimizer_model, query_optimizer_chat = create_gemini_model()
166
+
167
+
168
+ # def get_paper_links(papers):
169
+ # urls = []
170
+ # for paper in papers:
171
+ # urls = paper['url']
172
+
173
+ # return urls
174
+
175
+
176
def predict(message, history):
    """Chat callback: literature review on the first turn, follow-ups afterwards.

    On the first turn (empty ``history``) the message is treated as a research
    query: it is optionally rewritten by the query-optimizer chat, used to
    search for papers via ``find_basis_papers``, and the papers are handed to
    the summarizer chat. On later turns the message is forwarded to the same
    summarizer chat unchanged.

    Args:
        message: The text the user just sent.
        history: Previous turns of the conversation; empty on the first turn.

    Returns:
        The reply text with basic markdown stripped, or a short notice when
        no usable papers were found for the query.
    """
    if not history:
        query = message
        print(f"INITIAL QUERY: {query}")
        # ``optimize_query`` is only assigned inside the ``__main__`` guard, so
        # look it up defensively: importing this module must not raise NameError.
        if globals().get('optimize_query', False):
            optimizer_prompt = (
                "Given a search query, return an optimized\n"
                "version of the query to find related academic papers\n"
                f"QUERY: {query}.\n"
                "Only return the optimized query"
            )

            response = query_optimizer_chat.send_message(optimizer_prompt)
            query = markdown_to_raw(response.text)
            print(f"OPTIMIZED QUERY: {query}")

        # optimized query used to search semantic scholar
        papers = find_basis_papers(query)
        if not papers:
            # None (no hits) or [] (all hits filtered out): do not ask the
            # model to review nothing. Later turns skip the search because
            # history is then non-empty, hence the advice to clear the chat.
            return ('No suitable papers were found for this query. '
                    'Please clear the chat and try another query.')

        summarizer_prompt = (
            "Given the following academic papers,\n"
            f"return a review of related literature for the search query: {query}.\n"
            "Focus on data/key factors and methodologies considered.\n"
            f"Here are the papers {papers}\n"
            "Include the paper urls at the end of the review of related literature.\n"
        )

        response = summarizer_chat.send_message(summarizer_prompt)
        return markdown_to_raw(response.text)

    # Follow-up turn: continue the conversation in the summarizer chat, which
    # already holds the papers and the review.
    response = summarizer_chat.send_message(message)
    return markdown_to_raw(response.text)
209
+
210
+
211
def main():
    """Build the Gradio chat interface around ``predict`` and launch it."""
    blurb = (
        "Start by inputing a brief description/title\n"
        "of your research and our assistant will return a review of\n"
        "related literature\n"
        "\n"
        "ex. Finding optimal site locations for solar farms"
    )
    demo = gr.ChatInterface(
        predict,
        title="LLM Research Helper",
        description=blurb,
    )
    demo.launch()
222
+
223
+
224
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Literature review chatbot")
    parser.add_argument(
        "-o", "--optimize_query",
        help="Use query optimization (True, False)",
        default=False,
        # argparse hands over the raw command-line string, which is never equal
        # to the bools True/False; convert it explicitly so "-o True" actually
        # enables optimization. The non-string default is left untouched.
        type=lambda value: str(value).strip().lower() in ('true', '1', 'yes', 'y'),
    )
    args = parser.parse_args()

    # Module-level flag read by predict.
    optimize_query = args.optimize_query
    main()
requirements.txt ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.4
3
+ aiosignal==1.3.1
4
+ altair==5.3.0
5
+ annotated-types==0.6.0
6
+ anyio==4.3.0
7
+ appnope==0.1.4
8
+ argon2-cffi==23.1.0
9
+ argon2-cffi-bindings==21.2.0
10
+ arrow==1.3.0
11
+ asttokens==2.4.1
12
+ async-lru==2.0.4
13
+ attrs==23.2.0
14
+ Babel==2.14.0
15
+ beautifulsoup4==4.12.3
16
+ bleach==6.1.0
17
+ cachetools==5.3.3
18
+ certifi==2024.2.2
19
+ cffi==1.16.0
20
+ charset-normalizer==3.3.2
21
+ click==8.1.7
22
+ comm==0.2.2
23
+ contourpy==1.2.1
24
+ cycler==0.12.1
25
+ datasets==2.18.0
26
+ debugpy==1.8.1
27
+ decorator==5.1.1
28
+ defusedxml==0.7.1
29
+ dill==0.3.8
30
+ distro==1.9.0
31
+ evaluate==0.4.1
32
+ executing==2.0.1
33
+ fastapi==0.110.1
34
+ fastjsonschema==2.19.1
35
+ ffmpy==0.3.2
36
+ filelock==3.13.4
37
+ fonttools==4.51.0
38
+ fqdn==1.5.1
39
+ frozenlist==1.4.1
40
+ fsspec==2024.2.0
41
+ google-ai-generativelanguage==0.6.1
42
+ google-api-core==2.18.0
43
+ google-api-python-client==2.125.0
44
+ google-auth==2.29.0
45
+ google-auth-httplib2==0.2.0
46
+ google-generativeai==0.5.0
47
+ googleapis-common-protos==1.63.0
48
+ gradio==4.26.0
49
+ gradio_client==0.15.1
50
+ grpcio==1.62.1
51
+ grpcio-status==1.62.1
52
+ h11==0.14.0
53
+ httpcore==1.0.5
54
+ httplib2==0.22.0
55
+ httpx==0.27.0
56
+ huggingface-hub==0.22.2
57
+ idna==3.7
58
+ importlib_resources==6.4.0
59
+ ipykernel==6.29.4
60
+ ipython==8.23.0
61
+ ipywidgets==8.1.2
62
+ isoduration==20.11.0
63
+ jedi==0.19.1
64
+ Jinja2==3.1.3
65
+ jiwer==3.0.3
66
+ json5==0.9.25
67
+ jsonpointer==2.4
68
+ jsonschema==4.21.1
69
+ jsonschema-specifications==2023.12.1
70
+ jupyter==1.0.0
71
+ jupyter-console==6.6.3
72
+ jupyter-events==0.10.0
73
+ jupyter-lsp==2.2.5
74
+ jupyter_client==8.6.1
75
+ jupyter_core==5.7.2
76
+ jupyter_server==2.14.0
77
+ jupyter_server_terminals==0.5.3
78
+ jupyterlab==4.1.6
79
+ jupyterlab_pygments==0.3.0
80
+ jupyterlab_server==2.26.0
81
+ jupyterlab_widgets==3.0.10
82
+ kiwisolver==1.4.5
83
+ markdown-it-py==3.0.0
84
+ MarkupSafe==2.1.5
85
+ matplotlib==3.8.4
86
+ matplotlib-inline==0.1.6
87
+ mdurl==0.1.2
88
+ mistune==3.0.2
89
+ multidict==6.0.5
90
+ multiprocess==0.70.16
91
+ nbclient==0.10.0
92
+ nbconvert==7.16.3
93
+ nbformat==5.10.4
94
+ nest-asyncio==1.6.0
95
+ notebook==7.1.2
96
+ notebook_shim==0.2.4
97
+ numpy==1.26.4
98
+ openai==1.17.1
99
+ orjson==3.10.0
100
+ overrides==7.7.0
101
+ packaging==24.0
102
+ pandas==2.2.2
103
+ pandocfilters==1.5.1
104
+ parso==0.8.4
105
+ pexpect==4.9.0
106
+ pillow==10.3.0
107
+ platformdirs==4.2.0
108
+ prometheus_client==0.20.0
109
+ prompt-toolkit==3.0.43
110
+ proto-plus==1.23.0
111
+ protobuf==4.25.3
112
+ psutil==5.9.8
113
+ ptyprocess==0.7.0
114
+ pure-eval==0.2.2
115
+ pyarrow==15.0.2
116
+ pyarrow-hotfix==0.6
117
+ pyasn1==0.6.0
118
+ pyasn1_modules==0.4.0
119
+ pycparser==2.22
120
+ pydantic==2.7.0
121
+ pydantic_core==2.18.1
122
+ pydub==0.25.1
123
+ Pygments==2.17.2
124
+ pyparsing==3.1.2
125
+ python-dateutil==2.9.0.post0
126
+ python-json-logger==2.0.7
127
+ python-multipart==0.0.9
128
+ pytz==2024.1
129
+ PyYAML==6.0.1
130
+ pyzmq==25.1.2
131
+ qtconsole==5.5.1
132
+ QtPy==2.4.1
133
+ rapidfuzz==3.8.1
134
+ referencing==0.34.0
135
+ requests==2.31.0
136
+ responses==0.18.0
137
+ rfc3339-validator==0.1.4
138
+ rfc3986-validator==0.1.1
139
+ rich==13.7.1
140
+ rpds-py==0.18.0
141
+ rsa==4.9
142
+ ruff==0.3.7
143
+ semantic-version==2.10.0
144
+ Send2Trash==1.8.3
145
+ setuptools==68.2.2
146
+ shellingham==1.5.4
147
+ six==1.16.0
148
+ sniffio==1.3.1
149
+ soupsieve==2.5
150
+ stack-data==0.6.3
151
+ starlette==0.37.2
152
+ terminado==0.18.1
153
+ tinycss2==1.2.1
154
+ tomlkit==0.12.0
155
+ toolz==0.12.1
156
+ tornado==6.4
157
+ tqdm==4.66.2
158
+ traitlets==5.14.2
159
+ typer==0.12.3
160
+ types-python-dateutil==2.9.0.20240316
161
+ typing_extensions==4.11.0
162
+ tzdata==2024.1
163
+ uri-template==1.3.0
164
+ uritemplate==4.1.1
165
+ urllib3==2.2.1
166
+ uvicorn==0.29.0
167
+ wcwidth==0.2.13
168
+ webcolors==1.13
169
+ webencodings==0.5.1
170
+ websocket-client==1.7.0
171
+ websockets==11.0.3
172
+ wheel==0.41.2
173
+ widgetsnbextension==4.0.10
174
+ xxhash==3.4.1
175
+ yarl==1.9.4