Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,8 +17,9 @@ import arxiv
|
|
| 17 |
import scholarly
|
| 18 |
import pymed
|
| 19 |
import wikipedia
|
| 20 |
-
#from
|
| 21 |
-
|
|
|
|
| 22 |
import pickle
|
| 23 |
import faiss
|
| 24 |
import threading
|
|
@@ -282,10 +283,10 @@ def tool_search_scholar(query: str, max_results: int = 5) -> list:
|
|
| 282 |
|
| 283 |
def extract_article_content(url: str) -> str:
|
| 284 |
try:
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
return
|
| 289 |
except Exception as e:
|
| 290 |
logger.error(f"Failed to extract article content from {url}: {e}")
|
| 291 |
return ""
|
|
@@ -575,14 +576,9 @@ def tool_draft_research_plan(prompt: str, entities: list, focus_areas: list = []
|
|
| 575 |
return "Could not generate a research plan due to an error."
|
| 576 |
|
| 577 |
def tool_extract_article(url: str) -> str:
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
if len(content) > MAX_FULL_TEXT_LENGTH:
|
| 583 |
-
content = content[:MAX_FULL_TEXT_LENGTH] + "... [content truncated]"
|
| 584 |
-
|
| 585 |
-
return content
|
| 586 |
|
| 587 |
tools = {
|
| 588 |
"search_web": {
|
|
@@ -679,7 +675,7 @@ tools = {
|
|
| 679 |
"description": "Identifies contradictions across multiple insights.",
|
| 680 |
"parameters": {
|
| 681 |
"insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."},
|
| 682 |
-
|
| 683 |
},
|
| 684 |
"identify_focus_areas": {
|
| 685 |
"function": tool_identify_focus_areas,
|
|
@@ -761,7 +757,7 @@ def deep_research(prompt):
|
|
| 761 |
context = research_data.get('context', [])
|
| 762 |
all_insights = research_data.get('all_insights', [])
|
| 763 |
entity_specific_insights = research_data.get('entity_specific_insights', {})
|
| 764 |
-
intermediate_output = ""
|
| 765 |
previous_queries = research_data.get('previous_queries', [])
|
| 766 |
failed_queries = research_data.get('failed_queries', [])
|
| 767 |
reasoning_context = research_data.get('reasoning_context', [])
|
|
@@ -772,12 +768,11 @@ def deep_research(prompt):
|
|
| 772 |
contradictions = research_data.get('contradictions', [])
|
| 773 |
research_session_id = research_data.get('research_session_id', str(uuid4()))
|
| 774 |
|
| 775 |
-
# Restore or initialize FAISS index
|
| 776 |
global index
|
| 777 |
if research_data:
|
| 778 |
logger.info("Restoring FAISS Index from loaded data.")
|
| 779 |
else:
|
| 780 |
-
index.reset()
|
| 781 |
logger.info("Initialized a fresh FAISS Index")
|
| 782 |
|
| 783 |
key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt)
|
|
@@ -793,14 +788,13 @@ def deep_research(prompt):
|
|
| 793 |
entity_progress[entity]['queries'] = research_data[entity]['queries']
|
| 794 |
entity_progress[entity]['insights'] = research_data[entity]['insights']
|
| 795 |
|
| 796 |
-
if
|
| 797 |
initial_focus_areas = tool_identify_focus_areas(prompt=prompt)
|
| 798 |
research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas)
|
| 799 |
context.append(f"Initial Research Plan: {research_plan[:200]}...")
|
| 800 |
intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n"
|
| 801 |
focus_areas = initial_focus_areas
|
| 802 |
-
|
| 803 |
-
focus_areas = tool_identify_focus_areas(prompt=prompt, insights=all_insights, failed_areas=failed_areas)
|
| 804 |
|
| 805 |
for i in range(MAX_ITERATIONS):
|
| 806 |
if key_entities and i > 0:
|
|
@@ -811,8 +805,7 @@ def deep_research(prompt):
|
|
| 811 |
|
| 812 |
context.append(f"Current focus: {current_entity}")
|
| 813 |
|
| 814 |
-
|
| 815 |
-
if i > 0: # Don't do it on first iteration
|
| 816 |
faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}")
|
| 817 |
faiss_context = []
|
| 818 |
for idx in faiss_results_indices:
|
|
@@ -852,7 +845,7 @@ def deep_research(prompt):
|
|
| 852 |
entity_progress['general']['insights'].append(reasoning_output)
|
| 853 |
reasoning_context.append(reasoning_output)
|
| 854 |
context.append(f"Initial Reasoning: {reasoning_output[:200]}...")
|
| 855 |
-
add_to_faiss_index(reasoning_output)
|
| 856 |
else:
|
| 857 |
failed_queries.append(initial_query)
|
| 858 |
context.append(f"Initial query yielded no relevant results: {initial_query}")
|
|
@@ -904,7 +897,7 @@ def deep_research(prompt):
|
|
| 904 |
entity_specific_insights[current_entity].append(entity_reasoning)
|
| 905 |
|
| 906 |
context.append(f"Reasoning about {current_entity}: {entity_reasoning[:200]}...")
|
| 907 |
-
add_to_faiss_index(entity_reasoning)
|
| 908 |
else:
|
| 909 |
failed_queries.append(entity_query)
|
| 910 |
context.append(f"Entity query for {current_entity} yielded no relevant results")
|
|
@@ -998,7 +991,7 @@ def deep_research(prompt):
|
|
| 998 |
entity_specific_insights[current_entity].append(result)
|
| 999 |
else:
|
| 1000 |
reasoning_context.append(result)
|
| 1001 |
-
add_to_faiss_index(result)
|
| 1002 |
all_insights.append(result)
|
| 1003 |
|
| 1004 |
elif tool_name == "critique_reasoning":
|
|
@@ -1040,7 +1033,7 @@ def deep_research(prompt):
|
|
| 1040 |
reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}])
|
| 1041 |
if reasoning_about_article:
|
| 1042 |
all_insights.append(reasoning_about_article)
|
| 1043 |
-
add_to_faiss_index(reasoning_about_article)
|
| 1044 |
|
| 1045 |
|
| 1046 |
elif tool_name == "meta_analyze":
|
|
@@ -1052,7 +1045,7 @@ def deep_research(prompt):
|
|
| 1052 |
if result:
|
| 1053 |
all_insights.append(result)
|
| 1054 |
context.append(f"Meta-analysis across entities: {result[:200]}...")
|
| 1055 |
-
add_to_faiss_index(result)
|
| 1056 |
|
| 1057 |
elif tool_name == "draft_research_plan":
|
| 1058 |
result = "Research plan already generated."
|
|
@@ -1077,7 +1070,6 @@ def deep_research(prompt):
|
|
| 1077 |
intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n"
|
| 1078 |
continue
|
| 1079 |
|
| 1080 |
-
# Save research data after each iteration
|
| 1081 |
research_data = {
|
| 1082 |
'context': context,
|
| 1083 |
'all_insights': all_insights,
|
|
@@ -1088,7 +1080,7 @@ def deep_research(prompt):
|
|
| 1088 |
'previous_critiques': previous_critiques,
|
| 1089 |
'focus_areas': focus_areas,
|
| 1090 |
'failed_areas': failed_areas,
|
| 1091 |
-
'seen_snippets': list(seen_snippets),
|
| 1092 |
'contradictions': contradictions,
|
| 1093 |
'research_session_id': research_session_id
|
| 1094 |
}
|
|
@@ -1134,8 +1126,6 @@ def deep_research(prompt):
|
|
| 1134 |
|
| 1135 |
return full_output
|
| 1136 |
|
| 1137 |
-
# Gradio Interface
|
| 1138 |
-
|
| 1139 |
custom_css = """
|
| 1140 |
.gradio-container {
|
| 1141 |
background-color: #f7f9fc;
|
|
@@ -1143,7 +1133,7 @@ custom_css = """
|
|
| 1143 |
.output-box {
|
| 1144 |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 1145 |
line-height: 1.5;
|
| 1146 |
-
font-size: 14px;
|
| 1147 |
}
|
| 1148 |
h3 {
|
| 1149 |
color: #2c3e50;
|
|
@@ -1177,7 +1167,7 @@ iface = gr.Interface(
|
|
| 1177 |
theme="default",
|
| 1178 |
cache_examples=False,
|
| 1179 |
css=custom_css,
|
| 1180 |
-
allow_flagging="never",
|
| 1181 |
)
|
| 1182 |
|
| 1183 |
if __name__ == "__main__":
|
|
|
|
| 17 |
import scholarly
|
| 18 |
import pymed
|
| 19 |
import wikipedia
|
| 20 |
+
#from newspaper3k import Article # Removed newspaper3k
|
| 21 |
+
import trafilatura # Import trafilatura
|
| 22 |
+
from trafilatura import extract, fetch_url
|
| 23 |
import pickle
|
| 24 |
import faiss
|
| 25 |
import threading
|
|
|
|
| 283 |
|
| 284 |
def extract_article_content(url: str) -> str:
    """Download ``url`` and return its main text content.

    Uses trafilatura's ``fetch_url`` to download the page and ``extract``
    (with ``favor_precision=True``) to pull out the article text.

    Args:
        url: Address of the page to download and extract.

    Returns:
        The extracted text, or an empty string when the download fails,
        when nothing could be extracted, or when any exception is raised.
    """
    try:
        downloaded = fetch_url(url)
        if downloaded is None:  # Handle potential download failures
            return ""
        # extract() yields None when it finds no usable text; normalise that
        # to "" so this function always honours its ``-> str`` contract.
        return extract(downloaded, favor_precision=True) or ""
    except Exception as e:
        # Best-effort helper: log the failure and fall back to an empty result.
        logger.error(f"Failed to extract article content from {url}: {e}")
        return ""
|
|
|
|
| 576 |
return "Could not generate a research plan due to an error."
|
| 577 |
|
| 578 |
def tool_extract_article(url: str) -> str:
    """Return the extracted article text for ``url`` or a fallback message.

    Delegates the actual extraction to ``extract_article_content``. When that
    call produces a falsy result, a "Could not extract content" message naming
    the URL is returned instead.
    """
    article_text = extract_article_content(url)
    if not article_text:
        return f"Could not extract content from {url}"
    return article_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
|
| 583 |
tools = {
|
| 584 |
"search_web": {
|
|
|
|
| 675 |
"description": "Identifies contradictions across multiple insights.",
|
| 676 |
"parameters": {
|
| 677 |
"insights": {"type": "array", "description": "Collection of insights to analyze for contradictions."},
|
| 678 |
+
},
|
| 679 |
},
|
| 680 |
"identify_focus_areas": {
|
| 681 |
"function": tool_identify_focus_areas,
|
|
|
|
| 757 |
context = research_data.get('context', [])
|
| 758 |
all_insights = research_data.get('all_insights', [])
|
| 759 |
entity_specific_insights = research_data.get('entity_specific_insights', {})
|
| 760 |
+
intermediate_output = ""
|
| 761 |
previous_queries = research_data.get('previous_queries', [])
|
| 762 |
failed_queries = research_data.get('failed_queries', [])
|
| 763 |
reasoning_context = research_data.get('reasoning_context', [])
|
|
|
|
| 768 |
contradictions = research_data.get('contradictions', [])
|
| 769 |
research_session_id = research_data.get('research_session_id', str(uuid4()))
|
| 770 |
|
|
|
|
| 771 |
global index
|
| 772 |
if research_data:
|
| 773 |
logger.info("Restoring FAISS Index from loaded data.")
|
| 774 |
else:
|
| 775 |
+
index.reset()
|
| 776 |
logger.info("Initialized a fresh FAISS Index")
|
| 777 |
|
| 778 |
key_entities_with_descriptions = tool_extract_key_entities(prompt=prompt)
|
|
|
|
| 788 |
entity_progress[entity]['queries'] = research_data[entity]['queries']
|
| 789 |
entity_progress[entity]['insights'] = research_data[entity]['insights']
|
| 790 |
|
| 791 |
+
if not focus_areas: # Corrected placement: outside the loop
|
| 792 |
initial_focus_areas = tool_identify_focus_areas(prompt=prompt)
|
| 793 |
research_plan = tool_draft_research_plan(prompt=prompt, entities=key_entities, focus_areas=initial_focus_areas)
|
| 794 |
context.append(f"Initial Research Plan: {research_plan[:200]}...")
|
| 795 |
intermediate_output += f"Initial Research Plan:\n{research_plan}\n\n"
|
| 796 |
focus_areas = initial_focus_areas
|
| 797 |
+
|
|
|
|
| 798 |
|
| 799 |
for i in range(MAX_ITERATIONS):
|
| 800 |
if key_entities and i > 0:
|
|
|
|
| 805 |
|
| 806 |
context.append(f"Current focus: {current_entity}")
|
| 807 |
|
| 808 |
+
if i > 0:
|
|
|
|
| 809 |
faiss_results_indices = search_faiss_index(prompt if current_entity == 'general' else f"{prompt} {current_entity}")
|
| 810 |
faiss_context = []
|
| 811 |
for idx in faiss_results_indices:
|
|
|
|
| 845 |
entity_progress['general']['insights'].append(reasoning_output)
|
| 846 |
reasoning_context.append(reasoning_output)
|
| 847 |
context.append(f"Initial Reasoning: {reasoning_output[:200]}...")
|
| 848 |
+
add_to_faiss_index(reasoning_output)
|
| 849 |
else:
|
| 850 |
failed_queries.append(initial_query)
|
| 851 |
context.append(f"Initial query yielded no relevant results: {initial_query}")
|
|
|
|
| 897 |
entity_specific_insights[current_entity].append(entity_reasoning)
|
| 898 |
|
| 899 |
context.append(f"Reasoning about {current_entity}: {entity_reasoning[:200]}...")
|
| 900 |
+
add_to_faiss_index(entity_reasoning)
|
| 901 |
else:
|
| 902 |
failed_queries.append(entity_query)
|
| 903 |
context.append(f"Entity query for {current_entity} yielded no relevant results")
|
|
|
|
| 991 |
entity_specific_insights[current_entity].append(result)
|
| 992 |
else:
|
| 993 |
reasoning_context.append(result)
|
| 994 |
+
add_to_faiss_index(result)
|
| 995 |
all_insights.append(result)
|
| 996 |
|
| 997 |
elif tool_name == "critique_reasoning":
|
|
|
|
| 1033 |
reasoning_about_article = tool_reason(prompt=prompt, search_results=[{"title": "Extracted Article", "snippet": result, "url": parameters['url']}])
|
| 1034 |
if reasoning_about_article:
|
| 1035 |
all_insights.append(reasoning_about_article)
|
| 1036 |
+
add_to_faiss_index(reasoning_about_article)
|
| 1037 |
|
| 1038 |
|
| 1039 |
elif tool_name == "meta_analyze":
|
|
|
|
| 1045 |
if result:
|
| 1046 |
all_insights.append(result)
|
| 1047 |
context.append(f"Meta-analysis across entities: {result[:200]}...")
|
| 1048 |
+
add_to_faiss_index(result)
|
| 1049 |
|
| 1050 |
elif tool_name == "draft_research_plan":
|
| 1051 |
result = "Research plan already generated."
|
|
|
|
| 1070 |
intermediate_output += f"Iteration {i+1} - Error: {str(e)}\n"
|
| 1071 |
continue
|
| 1072 |
|
|
|
|
| 1073 |
research_data = {
|
| 1074 |
'context': context,
|
| 1075 |
'all_insights': all_insights,
|
|
|
|
| 1080 |
'previous_critiques': previous_critiques,
|
| 1081 |
'focus_areas': focus_areas,
|
| 1082 |
'failed_areas': failed_areas,
|
| 1083 |
+
'seen_snippets': list(seen_snippets),
|
| 1084 |
'contradictions': contradictions,
|
| 1085 |
'research_session_id': research_session_id
|
| 1086 |
}
|
|
|
|
| 1126 |
|
| 1127 |
return full_output
|
| 1128 |
|
|
|
|
|
|
|
| 1129 |
custom_css = """
|
| 1130 |
.gradio-container {
|
| 1131 |
background-color: #f7f9fc;
|
|
|
|
| 1133 |
.output-box {
|
| 1134 |
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 1135 |
line-height: 1.5;
|
| 1136 |
+
font-size: 14px;
|
| 1137 |
}
|
| 1138 |
h3 {
|
| 1139 |
color: #2c3e50;
|
|
|
|
| 1167 |
theme="default",
|
| 1168 |
cache_examples=False,
|
| 1169 |
css=custom_css,
|
| 1170 |
+
allow_flagging="never",
|
| 1171 |
)
|
| 1172 |
|
| 1173 |
if __name__ == "__main__":
|