Spaces:

prthm11
/

Database_Agent

Runtime error

App Files Files Community

prthm11 committed on Aug 25, 2025

Commit

28d340c

verified ·

1 Parent(s): 1767a62

Upload app.py

Browse files

Files changed (1) hide show

app.py +630 -0

app.py ADDED Viewed

	@@ -0,0 +1,630 @@

+# --- IMPORTS ---
+from werkzeug.exceptions import TooManyRequests
+from flask import Flask, request, jsonify, render_template
+from flask_socketio import SocketIO, emit
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.agents import initialize_agent, AgentType, create_react_agent, AgentExecutor
+from langchain_community.agent_toolkits import create_sql_agent, SQLDatabaseToolkit
+from langchain_community.utilities import SQLDatabase
+from langchain.tools import Tool
+from langchain.memory import ConversationBufferMemory
+from pymongo import MongoClient
+import threading
+import os, uuid
+import re
+import traceback
+import ast
+from bson import json_util
+from dotenv import load_dotenv
+from werkzeug.utils import secure_filename
+from werkzeug.exceptions import HTTPException
+from langchain.prompts import ChatPromptTemplate
+from tabulate import tabulate
+from fuzzywuzzy import fuzz
+# from langchain_groq import ChatGroq
+from datetime import datetime
+def error_safe(f):
+    def wrapper(*args, **kwargs):
+        try:
+            return f(*args, **kwargs)
+        except HTTPException as he:
+            return jsonify({"status": "error", "message": he.description}), he.code
+        except Exception as e:
+            print("[ERROR] Uncaught Exception in", f.__name__)
+            traceback.print_exc()
+            return jsonify({"status": "error", "message": str(e)}), 500
+    wrapper.__name__ = f.__name__
+    return wrapper
+# --- ENV + FLASK SETUP ---
+load_dotenv()
+os.environ["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")
+app = Flask(__name__)
+app.config['SECRET_KEY'] = os.urandom(32)
+app.config['UPLOAD_FOLDER'] = 'uploads'
+socketio = SocketIO(app, cors_allowed_origins="*")
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+llm = ChatGoogleGenerativeAI(
+    temperature=0.2,
+    model="gemini-2.0-flash",
+    max_retries=50,
+    api_key=os.getenv("GEMINI_API_KEY")
+)
+# llm = ChatGroq(temperature=0.2, model_name="mistral-saba-24b",api_key=os.getenv("GROQ_API_KEY"))
+# --- GLOBALS ---
+agent_executor = None
+memory = ConversationBufferMemory(
+    memory_key="chat_history", return_messages=True, input_key="input")
+mongo_db = None
+client = None
+db_mode = None  # "mongo" or "sql"
+# --- SHARED ---
+def is_schema_request(prompt: str) -> bool:
+    pattern = re.compile(
+        r'\b(schema|table names|tables|columns|structure|column names|collections?|field names|metadata|describe|show)\b', re.IGNORECASE)
+    return bool(pattern.search(prompt))
+def is_sensitive_request(prompt: str) -> bool:
+    sensitive_keywords = [
+        "password", "token", "credential", "secret", "api key", "schema", "structure",
+        "collection name", "field name", "user_id", "order_id", "payment_id",
+        "internal", "database structure", "table structure", "email", "phone", "contact", "ssn"
+    ]
+    lowered = prompt.lower()
+    return any(keyword in lowered for keyword in sensitive_keywords)
+intent_prompt = ChatPromptTemplate.from_messages([
+    ("system", "Classify if the user is asking schema/structure/sensitive info (tables, columns, schema): YES or NO."),
+    ("human", "{prompt}")
+])
+intent_checker = intent_prompt | llm
+def is_schema_leak_request(prompt):
+    try:
+        classification = intent_checker.invoke({"prompt": prompt})
+        return "yes" in classification.content.strip().lower()
+    except:
+        return False
+# --- INIT SQL AGENT ---
+def init_sql_agent(db_path):
+    """Build the SQL agent for a SQLite file and make it the active agent.
+
+    Opens ``db_path`` through a ``sqlite:///`` URI, wraps it in a
+    ``SQLDatabaseToolkit`` bound to the shared ``llm``, creates the agent with
+    ``create_sql_agent`` and stores it in the module-level ``agent_executor``.
+    ``db_mode`` is set to ``"sql"`` afterwards.
+
+    Args:
+        db_path: Filesystem path of the SQLite database file.
+    """
+    global agent_executor, db_mode
+    db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
+    toolkit = SQLDatabaseToolkit(db=db, llm=llm)
+    prefix = '''You are a helpful SQL expert agent that ALWAYS returns natural language answers using the tools.'''
+    # NOTE: the prefix string ends on the line above. Everything from here to
+    # the create_sql_agent() call is a commented-out draft of a longer prompt
+    # and is NOT sent to the model.
+    # Always format your responses in Markdown. For example:
+    # - Use bullet points
+    # - Use bold for headers
+    # - Wrap code in triple backticks
+    # - Tables should use Markdown table syntax
+    # You must NEVER:
+    # - Show or mention SQL syntax.
+    # - Reveal table names, column names, or database schema.
+    # - Respond with any technical details or structure of the database.
+    # - Return code or tool names.
+    # - Give wrong Answers.
+    # You must ALWAYS:
+    # - Respond in plain, friendly language.
+    # - Don't Summarize the result for the user (e.g., "There are 9 tables in the system.")
+    # - If asked to list table names or schema, politely refuse and respond with:
+    #     "I'm sorry, I can't share database structure information."
+    # - ALWAYS HAVE TO SOLVE COMPLEX USER QUERIES. FOR THAT, UNDERSTAND THE PROMPT, ANALYSE PROPER AND THEN GIVE ANSWER.
+    # - Your Answers should be correct, you have to do understand process well and give accurate answers.
+    # - IF USER ASK ABOUT DATA, Which is not there in a database, then GIVE FOLLOWING ANSWER:
+    #     "There is no such data in the Database."
+    # Strict Rules You MUST Follow:
+    # - NEVER display or mention SQL queries.
+    # - NEVER explain SQL syntax or logic.
+    # - NEVER return technical or code-like responses.
+    # - ONLY respond in natural, human-friendly language.
+    # - You are not allow to give the name of any COLUMNS, TABLES, DATABASE, ENTITY, SYNTAX, STRUCTURE, DESIGN, ETC...
+    # If the user asks for anything other than retrieving data (SELECT), respond using this exact message:
+    #     "I'm not allowed to perform operations other than SELECT queries. Please ask something that involves reading data."
+    # Do not return SQL queries or raw technical responses to the user.
+    # For example:
+    # Wrong: SELECT * FROM ...
+    # Correct: The user assigned to the cart is Alice Smith.
+    # Use the tools provided to get the correct data from the database and summarize the response clearly.
+    # If the input is unclear or lacks sufficient data, ask for clarification using the SubmitFinalAnswer tool.
+    # Never return SQL queries as your response.
+    # If you cannot find an answer,
+    # Double-check your query and running it again.
+    # - If a query fails, revise and try again.
+    # - Else 'No data found' using SubmitFinalAnswer.No SQL, no code. '''
+    # NOTE(review): `memory` is passed as an extra keyword argument; confirm
+    # that create_sql_agent actually forwards it to the executor.
+    agent_executor = create_sql_agent(
+        llm=llm,
+        toolkit=toolkit,
+        verbose=True,
+        prefix=prefix,
+        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+        memory=memory,
+        agent_executor_kwargs={"handle_parsing_errors": True},
+    )
+    db_mode = "sql"
+# --- INIT MONGO AGENT ---
+# Prompt text passed as `prefix` when the MongoDB agent is initialized. Only
+# the short text inside the triple-quoted string is live; the long instruction
+# list that follows the string is commented out.
+system_message = """
+        You are **MongoDBQueryBot**, a highly intelligent and accurate assistant for answering questions about data stored in a MongoDB database using tools.
+        """
+# ### 🚨 Critical Instructions (Strictly Follow These):
+# - You **must always** use tools provided to answer user questions.
+# - Always join IDs with associated human-readable values like names or titles when answering.
+# - Prefer displaying `user name`, `employee name`, or `product name` instead of internal IDs like `user_id`, `emp_id`, or `product_id`.
+# - Avoid responding only with technical identifiers. Make responses meaningful to users.
+# - **Never** guess or fabricate any information.
+# - **Do not** show raw JSON, field names, or database structure.
+# - Your role is **read-only**: do not suggest or perform insert/update/delete.
+# - After using all the available tools, if you are unable to find any documents, then give the following ANSWER:
+#     "Please, rephrase your query because I can't exactly understand, what you want !"
+# - If a query can't be answered or is unrelated to reading data, reply:
+# ❌ "I'm only allowed to retrieve data. Please ask a query involving reading information."
+# - IF USER ASK ABOUT DATA, Which is not there in a database, then GIVE FOLLOWING ANSWER:
+#             "There is no such data in the Database."
+# - When returning answers:
+#     - Do **not return internal IDs** like `user_id`, `order_id`, `payment_id`, etc.
+#     - Instead, use human-readable fields like `name`, `full_name`, `user_name`, etc., from related collections.
+#     - If only an ID is available, try joining the relevant collections to fetch the proper display name.
+# ### 🧠 How to Think:
+# - Understand **exactly** what the user is trying to ask. Do not answer if unclear — ask for clarification.
+# - Translate the user prompt into tool inputs by identifying:
+# - Which collection to search
+# - What value or field they're referring to
+# - The correct format expected by the tool
+# ### 🛠️ Tool Usage Guide:
+# - Use `FindDocuments` for queries like:
+# - "Show me all employees named John"
+# - "What is the salary of Manager X?"
+# - Use `ListCollections` to discover available data types (but don’t share them directly).
+# - **IMPORTANT: Don't keep iterating on a single tool; if you can't answer with the tool you are using, switch to another tool!**
+# - Use `JoinCollections` to resolve IDs into names when the question asks about people, customers, or products.
+# - When resolving names from payments, use this format:
+#   `from=Payments, key=order_id, to=Orders, match=order_id, next_key=user_id, next_to=Users, next_match=user_id, return=name`
+# - Your goal is to **return the person's name** (e.g., `name`, `user_name`, `full_name`) not their ID.
+# - Always prioritize returning names instead of internal identifiers.
+# - Examples:
+#     - For payment-related questions → Join Payments → Orders → Users and return name
+#     - For order questions → Join Orders → Users and return user names
+# ### 🧾 Response Format:
+# - Use **clear markdown with tables** when displaying data.
+# - If no data is found: return `**No documents found.**`
+# - Stay professional, brief, and relevant.
+# ### 🚫 Never Do This:
+# - Do not leak MongoDB structure, schema, or field names.
+# - Do not suggest code, MongoDB syntax, or field mappings.
+# - Do not hallucinate or make assumptions.
+# Start by analyzing the prompt carefully, select the right tool, invoke it, and return a user-friendly answer based on the result.
+# """
+def find_docs_tool_func(query: str) -> str:
+    """
+    Flexible MongoDB search with fallback:
+    - First tries in specified collection.
+    - If no results found, falls back to search across all collections.
+    Input format:
+    - collection=<collection>, key=<field>, value=<value>
+    - OR: collection=<collection>, value=<value>
+    """
+    try:
+        parts = dict(part.strip().split("=", 1)
+                     for part in query.split(",") if "=" in part)
+        collection = parts.get("collection")
+        key = parts.get("key")
+        value = parts.get("value")
+        if not collection:
+            return "❌ 'collection' is required."
+        def query_collection(coll_name):
+            if key and value:
+                return list(mongo_db[coll_name].find({key: value}, {'_id': 0}))
+            elif value:
+                return [doc for doc in mongo_db[coll_name].find({}, {'_id': 0}) if any(str(v).lower() == value.lower() for v in doc.values())]
+            else:
+                return list(mongo_db[coll_name].find({}, {'_id': 0}))
+        docs = query_collection(collection)
+        if docs:
+            return "\n markdown\n" + tabulate(docs, headers="keys", tablefmt="github") + "\n"
+        for coll in mongo_db.list_collection_names():
+            if coll == collection:
+                continue
+            docs = query_collection(coll)
+            if docs:
+                return "\n markdown\n" + tabulate(docs, headers="keys", tablefmt="github") + "\n"
+        return "**No documents found.**"
+    except Exception as e:
+        return f"Invalid input format or error: {str(e)}"
+def aggregate_group_by(_input: str):
+    try:
+        if _input.strip().startswith("{"):
+            # Parse JSON-like string
+            args = ast.literal_eval(_input)
+            collection = args.get("collection_name") or args.get("collection")
+            field = args.get("group_by") or args.get("field")
+        else:
+            # Handle legacy input format
+            args = dict(x.split("=") for x in _input.split(","))
+            collection = args["collection"]
+            field = args["field"]
+        pipeline = [
+            {"$group": {"_id": f"${field}", "count": {"$sum": 1}}},
+            {"$project": {"_id": 0, field: "$_id", "count": 1}}
+        ]
+        result = list(mongo_db[collection].aggregate(pipeline))
+        if not result:
+            return "**No data found.**"
+        return "\n markdown\n" + tabulate(result, headers="keys", tablefmt="github") + "\n"
+    except Exception as e:
+        return f"Aggregation failed: {e}"
+def get_all_documents(collection: str):
+    try:
+        docs = list(mongo_db[collection].find({}, {'_id': 0}))
+        if not docs:
+            return "**No documents found.**"
+        return "\n markdown\n" + tabulate(docs, headers="keys", tablefmt="github") + "\n"
+    except Exception as e:
+        return f"Error fetching documents: {e}"
+def fuzzy_find_documents(query: str):
+    try:
+        parts = dict(part.strip().split("=", 1) for part in query.split(","))
+        collection = parts["collection"]
+        value = parts["value"]
+        threshold = int(parts.get("threshold", 80))
+        matches = []
+        for doc in mongo_db[collection].find({}, {'_id': 0}):
+            if any(fuzz.partial_ratio(str(v).lower(), value.lower()) >= threshold for v in doc.values()):
+                matches.append(doc)
+        if not matches:
+            return "**No fuzzy matches found.**"
+        return "\n markdown\n" + tabulate(matches, headers="keys", tablefmt="github") + "\n"
+    except Exception as e:
+        return f"Fuzzy match error: {e}"
+# def join_collections_tool_func(_input: str):
+#     try:
+#         # Parse input like: from=Products, key=category_id, to=Categories, match=category_id, return=category_name
+#         args = dict(x.strip().split("=", 1) for x in _input.split(","))
+#         from_collection = args["from"]
+#         foreign_key = args["key"]
+#         to_collection = args["to"]
+#         match_key = args["match"]
+#         return_field = args["return"]
+#         results = []
+#         foreign_lookup = {
+#             doc[match_key]: doc.get(return_field)
+#             for doc in mongo_db[to_collection].find()
+#             if match_key in doc
+#         }
+#         for doc in mongo_db[from_collection].find({}, {'_id': 0}):
+#             doc[return_field] = foreign_lookup.get(doc.get(foreign_key), "Unknown")
+#             results.append(doc)
+#         if not results:
+#             return "**No documents found.**"
+#         return "\n markdown\n" + tabulate(results, headers="keys", tablefmt="github") + "\n"
+#     except Exception as e:
+#         return f"Join failed: {e}"
+def join_collections_tool_func(_input: str):
+    """
+    Supports 2-level join (Payments → Orders → Users) or any pair-wise join
+    Input formats:
+    - from=Payments, key=order_id, to=Orders, match=order_id, next_key=user_id, next_to=Users, next_match=user_id, return=name
+    - from=Products, key=category_id, to=Categories, match=category_id, return=category_name
+    """
+    try:
+        args = dict(x.strip().split("=", 1) for x in _input.split(","))
+        from_coll = args["from"]
+        key = args["key"]
+        to_coll = args["to"]
+        match = args["match"]
+        return_field = args["return"]
+        next_key = args.get("next_key")
+        next_to = args.get("next_to")
+        next_match = args.get("next_match")
+        # First join (e.g., Payments → Orders)
+        to_docs = {doc[match]: doc for doc in mongo_db[to_coll].find()
+                   if match in doc}
+        joined = []
+        for doc in mongo_db[from_coll].find({}, {'_id': 0}):
+            foreign_doc = to_docs.get(doc.get(key))
+            if not foreign_doc:
+                continue
+            merged = {**doc, **foreign_doc}
+            joined.append(merged)
+        # Second join (e.g., Orders → Users)
+        if next_key and next_to and next_match:
+            next_docs = {
+                doc[next_match]: doc for doc in mongo_db[next_to].find() if next_match in doc}
+            for doc in joined:
+                user_doc = next_docs.get(doc.get(next_key))
+                if user_doc:
+                    doc[return_field] = user_doc.get(return_field, "Unknown")
+                else:
+                    doc[return_field] = "Unknown"
+        # Prepare final result
+        if not joined:
+            return "**No documents found.**"
+        final = [{return_field: doc.get(return_field)}
+                 for doc in joined if return_field in doc]
+        return "\n markdown\n" + tabulate(final, headers="keys", tablefmt="github") + "\n"
+    except Exception as e:
+        return f"Join failed: {e}"
+def smart_join_router(prompt: str) -> str:
+    """
+    An intelligent router that suggests the correct JoinCollections input string
+    for common user intent like payments → orders → users → name.
+    """
+    prompt_lower = prompt.lower()
+    if "payment" in prompt_lower and any(term in prompt_lower for term in ["who", "name", "user", "person"]):
+        return "from=Payments, key=order_id, to=Orders, match=order_id, next_key=user_id, next_to=Users, next_match=user_id, return=name"
+    elif "order" in prompt_lower and "name" in prompt_lower:
+        return "from=Orders, key=user_id, to=Users, match=user_id, return=name"
+    # Extend as needed
+    return "Unable to auto-generate join path. Please provide more context."
+def init_mongo_agent(json_path):
+    global agent_executor, client, mongo_db, db_mode
+    client = MongoClient("mongodb://localhost:27017/")
+    mongo_db = client['uploaded_mongo']
+    with open(json_path, 'r', encoding='utf-8') as f:
+        data = json_util.loads(f.read())
+    # Handle both single-collection and multi-collection formats
+    if isinstance(data, list):
+        # Default collection name if only a list is provided
+        collection = mongo_db['default_collection']
+        collection.drop()
+        collection.insert_many(data)
+    elif isinstance(data, dict):
+        for col_name, docs in data.items():
+            collection = mongo_db[col_name]
+            collection.drop()
+            if isinstance(docs, list):
+                collection.insert_many(docs)
+            else:
+                collection.insert_one(docs)
+    else:
+        raise ValueError("Unsupported JSON format. Must be a list or dict.")
+    def list_collections(_input=None):
+        return mongo_db.list_collection_names()
+    find_docs_tool = Tool(
+        name="FindDocuments",
+        description=(
+            "Use this tool to find documents in a MongoDB collection.\n"
+            "Input format:\n"
+            "- `collection=<collection>, key=<field>, value=<value>` for precise queries\n"
+            "- OR `collection=<collection>, value=<value>` to search across all fields\n"
+            "If `key` is omitted, the tool will automatically scan all fields to find matching values.\n"
+            "Examples:\n"
+            "- `collection=default_collection, key=name, value=Lauren Alexander`\n"
+            "- `collection=default_collection, value=Lauren Alexander`"
+        ),
+        func=find_docs_tool_func)
+    aggregate_tool = Tool(
+        name="AggregateGroupBy",
+        func=aggregate_group_by,
+        description=(
+            "Group documents and count by any field. Format: collection=<name>, field=<group_by_field>. E.g., collection=residents, field=gender"
+        )
+    )
+    get_all_documents_tool = Tool(
+        name="GetAllDocuments",
+        func=get_all_documents,
+        description=(
+            "Fetch all documents from a collection. Input: collection name only. Example: residents"
+        )
+    )
+    fuzzy_tool = Tool(
+        name="FuzzyFindDocuments",
+        func=fuzzy_find_documents,
+        description=("Fuzzy match documents across all fields in a collection. Format: collection=<name>, value=<search_term>, threshold=80 (optional)"
+                     )
+    )
+    join_collection_tool = Tool(
+        name="JoinCollections",
+        func=join_collections_tool_func,
+        description=(
+            "Join collections to map foreign keys to human-readable values. Supports 1 or 2-level joins.\n"
+            "Formats:\n"
+            "- from=Payments, key=order_id, to=Orders, match=order_id, return=status\n"
+            "- from=Payments, key=order_id, to=Orders, match=order_id, next_key=user_id, next_to=Users, next_match=user_id, return=name"
+        )
+    )
+    smart_router_tool = Tool(
+        name="SmartJoinRouter",
+        func=smart_join_router,
+        description=(
+            "Suggest the correct JoinCollections input format based on user intent.\n"
+            "Use this when you are unsure how to form the join input."
+        )
+    )
+    tools = [
+        Tool(name="FindDocuments", func=find_docs_tool,
+             description="Flexible MongoDB search..."),
+        Tool(name="ListCollections", func=lambda x: list_collections(),
+             description="List all collections..."),
+        Tool(name="AggregateGroupBy", func=aggregate_tool,
+             description="Group and count by any field..."),
+        Tool(name="GetAllDocuments", func=get_all_documents_tool,
+             description="Fetch all documents from a collection..."),
+        Tool(name="FuzzyFindDocuments", func=fuzzy_tool,
+             description="Fuzzy match documents across all fields..."),
+        Tool(name="JoinCollections", func=join_collection_tool,
+             description="Join related collections to return names instead of IDs..."),
+        Tool(name="SmartJoinCollections", func=smart_router_tool,
+             description="Smrt Join related collections to return names instead of IDs...")
+    ]
+    agent_executor = initialize_agent(
+        tools=tools,
+        llm=llm,
+        agent_type=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
+        memory=memory,
+        verbose=True,
+        prefix=system_message,
+        handle_parsing_errors=True
+    )
+    db_mode = "mongo"
+@app.errorhandler(Exception)
+def handle_all_errors(e):
+    print(f"[ERROR] Global handler caught an exception: {str(e)}")
+    traceback.print_exc()
+    if isinstance(e, HTTPException):
+        return jsonify({"status": "error", "message": e.description}), e.code
+    return jsonify({"status": "error", "message": "An unexpected error occurred"}), 500
+@app.errorhandler(TooManyRequests)
+def handle_429_error(e):
+    return jsonify({
+        "status": "error",
+        "message": "🚦 Agent is busy, try again after sometime."
+    }), 429
+# --- ROUTES ---
+@app.route("/")
+def index():
+    return render_template("app_index.html")
+@app.route("/upload_db", methods=["POST"])
+@error_safe
+def upload_db():
+    file = request.files.get("file")
+    if not file or file.filename == "":
+        return jsonify(success=False, message="No file provided"), 400
+    filename = secure_filename(file.filename)
+    path = os.path.join(app.config["UPLOAD_FOLDER"], filename)
+    file.save(path)
+    try:
+        if filename.endswith(".json"):
+            init_mongo_agent(path)
+            mongo_db = globals().get("mongo_db")
+            db_name = getattr(mongo_db, "name", None) or os.path.splitext(filename)[0]
+            return jsonify({"database_name": db_name, "message": "MongoDB initialized"}), 200
+            # return jsonify(success=True, message="MongoDB initialized")
+        # elif filename.endswith(".db"):
+        #     init_sql_agent(path)
+        #     return jsonify(success=True, message="SQL DB initialized")
+        # SQL DB (.db or .sqlite)
+        elif filename.lower().endswith(".db") or filename.lower().endswith(".sqlite"):
+            init_sql_agent(path)  # your existing initializer
+            db_name = os.path.splitext(filename)[0]
+            return jsonify({"database_name": db_name, "message": "SQL DB initialized"}), 200
+        else:
+            return jsonify(success=False, message="Unsupported file format"), 400
+    except Exception as e:
+        traceback.print_exc()
+        return jsonify(success=False, message=f"Init failed: {e}"), 500
+@app.route("/generate", methods=["POST"])
+@error_safe
+def generate():
+    try:
+        data = request.get_json(force=True) or {}
+        prompt = data.get("prompt", "").strip()
+        if not prompt:
+            return jsonify({"status": "error", "message": "Prompt is required"}), 400
+    except Exception:
+        traceback.print_exc()
+        return jsonify({"status": "error", "message": "Invalid input"}), 400
+    try:
+        # invoke your agent synchronously
+        result = agent_executor.invoke({"input": prompt})
+        # Normalize final_answer from agent output safely
+        if isinstance(result, dict):
+            final_answer = (
+                result.get("final_answer")
+                or result.get("output")
+                or result.get("answer")
+                or result.get("text")
+                or ""
+            )
+        else:
+            final_answer = str(result or "")
+        if final_answer is None:
+            final_answer = ""
+        # Optionally keep emitting to socket so clients listening to socketio still get it
+        try:
+            socketio.emit("final", {"message": final_answer})
+        except Exception:
+            app.logger.debug("socket emit failed, continuing")
+        return jsonify({"final_answer": final_answer, "prompt": prompt}), 200
+    except Exception as e:
+        app.logger.exception("Agent invocation failed")
+        return jsonify({"prompt": prompt, "final_answer": "", "message": f"Agent error: {str(e)[:200]}"}), 500
+# Script entry point: start the app through Flask-SocketIO's runner.
+# NOTE(review): debug=True is hard-coded — confirm this entry point is not
+# used for a publicly reachable deployment.
+if __name__ == "__main__":
+    socketio.run(app, debug=True)