|
|
import re |
|
|
import time |
|
|
import random |
|
|
import gradio as gr |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
|
|
|
# Feature flag: when True, SITE_URL is scraped with Selenium at import time.
ENABLE_SCRAPING = False

# Page whose element with id "content" is scraped when scraping is enabled.
SITE_URL = "https://your-agri-future-site.com"

# Text later used as retrieval context by the chatbot. It stays empty unless
# scraping is enabled and yields content.
knowledge_base = ""

if ENABLE_SCRAPING:
    try:
        # Selenium is imported lazily so it is only required when scraping is on.
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options
        from selenium.webdriver.common.by import By

        def scrape_site(url, wait_seconds=5):
            """Return the text of the element with id "content" at *url*.

            Args:
                url: Address of the page to load in headless Chrome.
                wait_seconds: Fixed delay after loading the page and before
                    reading the element (default 5, the original delay).

            Returns:
                The element's text, or an empty string when the element
                cannot be read. Returning "" instead of an error message
                keeps error text out of the knowledge base.

            Raises:
                Exception: Whatever Selenium raises when the browser cannot
                    be started or the page cannot be loaded; the browser is
                    still closed in that case.
            """
            options = Options()
            # The `Options.headless` attribute was deprecated and then removed
            # during the Selenium 4 series; the command-line flag is used
            # instead.
            options.add_argument("--headless")
            driver = webdriver.Chrome(options=options)
            try:
                driver.get(url)
                # Give the page time to render before reading the element.
                time.sleep(wait_seconds)
                try:
                    return driver.find_element(By.ID, "content").text
                except Exception as e:
                    print("Error encountered during scraping:", e)
                    return ""
            finally:
                # Always release the browser, even if loading the page failed.
                driver.quit()

        knowledge_base = scrape_site(SITE_URL)
        if knowledge_base:
            print("Scraped knowledge base successfully.")
        else:
            print("Scraping returned no content; proceeding without it.")
    except Exception as e:
        # Best effort: the app still starts without scraped content.
        print("Scraping failed or Selenium is not configured:", e)
else:
    print("Scraping is disabled; proceeding without scraped site content.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_greeting(query: str, lang: str) -> bool:
    """Return True when *query* opens with a greeting in language *lang*.

    The query is stripped and lower-cased, then compared against the greeting
    list for *lang* ("en", "fr" or "am"); any other language code falls back
    to the English list. A greeting only counts when it is followed by the
    end of the text or by a non-alphanumeric character, so "hi there" and
    "hi!" match while "high-yield crops" or "history" do not.

    Args:
        query: Raw user message.
        lang: Language code selecting the greeting list.

    Returns:
        True if the message starts with a whole-word greeting, else False.
        Empty or missing input is never a greeting.
    """
    greetings = {
        "en": ("hello", "hi", "hey", "good morning", "good afternoon", "good evening"),
        "fr": ("bonjour", "salut", "coucou", "bonsoir"),
        "am": ("ሰላም", "ሰላም እንደምን", "እንዴት"),
    }

    if not query:
        return False

    greet_list = greetings.get(lang, greetings["en"])
    # lower() leaves Ethiopic text unchanged, so it is safe for every language.
    normalized = query.strip().lower()

    for greet in greet_list:
        if not normalized.startswith(greet):
            continue
        remainder = normalized[len(greet):]
        # Require a word boundary so short greetings do not match longer words.
        if not remainder or not remainder[0].isalnum():
            return True
    return False
|
|
|
|
|
|
|
|
def get_out_of_scope_message(lang: str) -> str:
    """Pick a random "please stay on topic" reply in the requested language.

    Args:
        lang: Language code; "en", "fr" and "am" have their own replies and
            any other value falls back to the English ones.

    Returns:
        One reply chosen with ``random.choice`` from the selected list.
    """
    english = [
        "I appreciate your curiosity. However, my expertise lies exclusively in agricultural and agro-investment insights. Could you please frame your question accordingly?",
        "That’s an interesting thought, but I'm tailored specifically for topics concerning agriculture and agro-investment. Please ask a question within that realm.",
        "While I value your inquiry, I'm optimized to provide insights solely on agriculture and related investment matters. Could you rephrase your query to align with these topics?",
        "It appears your question may not be directly tied to agriculture or agro-investment. Please ask something along those lines so I can assist effectively.",
    ]
    french = [
        "J'apprécie votre curiosité. Cependant, mon expertise se limite exclusivement aux informations sur l'agriculture et les investissements agroalimentaires. Pourriez-vous reformuler votre question en ce sens ?",
        "C'est une pensée intéressante, mais je suis spécialisé dans les domaines de l'agriculture et des investissements agroalimentaires. Merci de poser une question dans ce domaine.",
        "Bien que votre question soit pertinente, je me concentre uniquement sur l'agriculture et les investissements associés. Pourriez-vous reformuler votre demande en conséquence ?",
        "Votre interrogation semble éloignée de l'agriculture ou des investissements agroalimentaires. Merci de poser une question dans ces domaines pour que je puisse vous aider efficacement.",
    ]
    amharic = [
        "እባክዎ ልጠይቁት ጥያቄ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ዙሪያ መሆኑን አላስቀምጥም። እባኮትን ጥያቄዎን እንደዚህ በማቅረብ ደግሞ ይሞክሩ።",
        "ልዩ ጥያቄዎችን ማቅረብ ይፈልጋሉ እንጂ፣ እኔ በተለይ በግብርናና በአገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች ላይ ብቻ እንደሚሰራ ተዘጋጅቻለሁ። እባክዎ ጥያቄዎን በእነዚህ ክስተቶች ውስጥ ያቅርቡ።",
        "እንደምታዩት ጥያቄዎ በግብርና ወይም በአገልግሎት ስርዓተ-ቢዝነስ ላይ የተመረጠ አይደለም። እባክዎ በዚህ አውድ የሆነ ጥያቄ ይጠይቁ።",
    ]

    replies_by_language = {"en": english, "fr": french, "am": amharic}
    # Unknown language codes use the English replies.
    candidates = replies_by_language[lang] if lang in replies_by_language else english
    return random.choice(candidates)
|
|
|
|
|
|
|
|
# English and French keywords that mark a text as being about agriculture or
# agro-investment.
_DOMAIN_KEYWORDS = (
    "agriculture", "farming", "crop", "agro", "investment", "soil",
    "irrigation", "harvest", "organic", "sustainable", "agribusiness",
    "livestock",
    "agroalimentaire", "agriculture durable",
)

# One case-insensitive pattern, compiled once: any keyword as a whole word,
# optionally followed by a plural "s" ("crops", "investments").
_DOMAIN_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(keyword) for keyword in _DOMAIN_KEYWORDS) + r")s?\b",
    re.IGNORECASE,
)


def is_domain_query(query: str) -> bool:
    """Return True when *query* mentions an agriculture/agro-investment keyword.

    Matching is case-insensitive and on whole words; a trailing plural "s"
    is accepted so that "crops" or "investments" count as in scope.

    Args:
        query: Text to inspect (a user message or a sentence of site text).

    Returns:
        True if at least one domain keyword occurs, False otherwise
        (including for empty or missing input).
    """
    return bool(query) and _DOMAIN_PATTERN.search(query) is not None
|
|
|
|
|
def retrieve_relevant_snippet(query: str, text: str, max_length: int = 300) -> str:
    """Return the first sentence of *text* that is relevant to *query*.

    *text* is split into sentences on ".", "?" and "!". A sentence is
    relevant when it contains a domain keyword (per ``is_domain_query``) and
    every word of the query occurs in it, compared case-insensitively as
    substrings.

    Args:
        query: User question; it is broken into word tokens so punctuation
            attached to a word does not block a match.
        text: Knowledge-base text to search.
        max_length: Maximum number of snippet characters kept before "..."
            is appended to a truncated snippet.

    Returns:
        The stripped sentence (truncated to *max_length* characters plus
        "..." when longer), or "" when nothing matches or when the query or
        text is empty.
    """
    # Sentences are split on ".?!", so a query word such as "soil?" could
    # never be found in any sentence; compare bare word tokens instead.
    words = re.findall(r"\w+", query.lower()) if query else []
    if not words or not text:
        return ""

    for sentence in re.split(r"[.?!]", text):
        lowered = sentence.lower()  # lower-case once per sentence
        if is_domain_query(sentence) and all(word in lowered for word in words):
            snippet = sentence.strip()
            if len(snippet) > max_length:
                return snippet[:max_length] + "..."
            return snippet
    return ""
|
|
|
|
|
|
|
|
# Client for the hosted chat model that generates the answers.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p, language):
    """Stream the chatbot's reply to *message*.

    The reply is chosen in three steps:
      1. Greetings get a fixed welcome in the selected language.
      2. Messages without a domain keyword get a random out-of-scope notice.
      3. Everything else is sent to the chat model together with the system
         message, the earlier turns and, when available, a snippet retrieved
         from the scraped knowledge base.

    Args:
        message: Latest user message.
        history: Earlier (user, assistant) turns.
        system_message: System prompt for the model.
        max_tokens: Passed to the model as ``max_tokens``.
        temperature: Passed to the model as ``temperature``.
        top_p: Passed to the model as ``top_p``.
        language: Language code ("en", "fr", "am") for the canned replies.

    Yields:
        The reply text; for model answers, the accumulated text after each
        streamed chunk.
    """
    if is_greeting(message, language):
        greetings = {
            "en": "Hello! How can I assist you today with your agriculture or agro-investment inquiries?",
            "fr": "Bonjour! Comment puis-je vous aider aujourd'hui en matière d'agriculture ou d'investissements agroalimentaires?",
            "am": "ሰላም! ዛሬ ስለ ግብርና ወይም ስለ አገልግሎት ስርዓተ-ቢዝነስ ጥያቄዎች እንዴት ልረዳዎት?",
        }
        yield greetings.get(language, greetings["en"])
        return

    if not is_domain_query(message):
        yield get_out_of_scope_message(language)
        return

    # Rebuild the conversation in the role/content format the client expects.
    messages_context = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages_context.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages_context.append({"role": "assistant", "content": assistant_msg})

    # Put any matching reference text at the very start of the context.
    if knowledge_base:
        snippet = retrieve_relevant_snippet(message, knowledge_base)
        if snippet:
            retrieval_context = f"Reference info from Agri Future Investment platform: {snippet}"
            messages_context.insert(0, {"role": "system", "content": retrieval_context})

    messages_context.append({"role": "user", "content": message})

    response = ""
    for chunk in client.chat_completion(
        messages_context,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # A streamed chunk may carry no text (content is None); appending
        # None to the string would raise TypeError.
        response += token or ""
        yield response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Extra controls passed to the chat UI, listed in the order of respond()'s
# parameters after (message, history): system message, max tokens,
# temperature, top-p and language.
_additional_inputs = [
    gr.Textbox(
        value="You are AgriFutureBot, designed to help visitors of the Agri Future Investment platform understand content about the site and answer questions strictly related to agriculture and agro-investment topics.",
        label="System Message",
    ),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)"),
    gr.Dropdown(choices=["en", "fr", "am"], value="en", label="Language (en, fr, am)"),
]

# Chat interface wired to respond().
demo = gr.ChatInterface(respond, additional_inputs=_additional_inputs)


if __name__ == "__main__":
    # Start the Gradio app only when the file is run as a script.
    demo.launch()