NASA-SMD-PCRAG-SCDD-GEN

Running

File size: 21,077 Bytes

5cf6588
536372f
 
 
 
 
 
c7af9e1
cb31dc6
 
 
 
c7af9e1
 
31952a4
9ef9a98
8b8df4a
1a1b0fb
 
66e99a9
a38e8fb
31952a4
20010b0
 
66e99a9
 
 
 
 
ed12800
66e99a9
 
 
536372f
 
 
 
 
 
 
 
 
 
8b8df4a
 
 
 
 
 
260e2b7
 
f978093
 
66e99a9
f978093
 
 
 
 
c70bfa4
74f3b52
 
b8bed0e
74f3b52
6606654
b8bed0e
74f3b52
f978093
acc0e3b
0431e33
6606654
 
acc0e3b
74f3b52
 
 
 
 
c3d1a8e
07cf345
d729da4
6606654
07cf345
a6d305c
2a93439
c3d1a8e
acc0e3b
9ed340e
a6d305c
 
74f3b52
260e2b7
536372f
8b8df4a
 
536372f
 
8b8df4a
 
 
0e05b66
5ea3859
8b8df4a
2a93439
3344bc7
8b8df4a
0e05b66
8b8df4a
 
 
 
 
 
36639c5
8b8df4a
0e05b66
 
 
8b8df4a
0e05b66
536372f
66e99a9
 
 
 
 
 
 
 
 
 
7e521eb
 
 
 
 
 
 
 
 
 
 
 
 
 
536372f
7e521eb
536372f
 
61b24cf
536372f
 
 
 
 
 
 
 
 
 
ed12800
 
 
 
 
c7af9e1
 
ed12800
 
 
 
 
 
 
 
536372f
a9f5c93
c7af9e1
cb31dc6
a18637e
c7af9e1
cb31dc6
e86985c
 
 
 
 
 
 
 
a9f5c93
 
 
 
 
 
 
 
 
 
 
 
f5f2b08
cb31dc6
c7af9e1
cb31dc6
 
b8bed0e
59f0172
b8bed0e
f5f2b08
 
 
 
 
cb31dc6
f5f2b08
cb31dc6
f5f2b08
cb31dc6
 
 
 
 
 
 
 
f5f2b08
 
 
 
 
 
 
0136c25
 
f5f2b08
0136c25
 
 
f5f2b08
 
a18637e
 
 
cb31dc6
c7af9e1
 
 
 
 
9ef9a98
 
 
 
4d1254b
 
9ef9a98
4d1254b
9ef9a98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d1254b
 
9ef9a98
 
4d1254b
 
 
 
 
 
 
9ef9a98
4d1254b
9ef9a98
4d1254b
 
9ef9a98
 
 
 
 
 
 
 
 
8b8df4a
0e05b66
1ef25fd
66e99a9
1ef25fd
 
66e99a9
4a16255
66e99a9
8b8df4a
 
536372f
66e99a9
 
 
 
4a16255
7d39f23
1a1b0fb
 
 
 
 
3f8d802
 
66e99a9
c7af9e1
66e99a9
4a16255
8b8df4a
ed12800
7e521eb
0e05b66
8b8df4a
7e521eb
 
 
 
 
 
 
66e99a9
3f8d802
 
66e99a9
 
3f8d802
66e99a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a16255
66e99a9
4a16255
8b8df4a
b64c6e8
ed12800
8b8df4a
 
a9f5c93
abd8ad5
8b8df4a
a9f5c93
645e61e
3f8d802
4a16255
8b8df4a
645e61e
31952a4
ee80e0c
8b8df4a
abd8ad5
9ef9a98
8b8df4a
ed12800
8b8df4a
66e99a9
4a16255
8b8df4a
 
 
66e99a9
c7af9e1
33d1fed
e351151
0e05b66
7bfd1fb
33d1fed
b669cb7
33d1fed
 
b669cb7
8b8df4a
 
33d1fed
7bfd1fb
8b8df4a
 
 
 
 
33d1fed
7bfd1fb
d729da4
190c324
66e99a9
33d1fed
8b8df4a
33d1fed
 
 
9a77a66
 
 
 
66e99a9
9a77a66
66e99a9
9a77a66
7a80c9e
0e05b66


import gradio as gr
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
import io
import tempfile
#import pyvo as vo
import pandas as pd
from pinecone import Pinecone
import logging
import re

from utils.ads_references import extract_keywords_with_gpt, fetch_nasa_ads_references 
from utils.data_insights import fetch_exoplanet_data, generate_data_insights


# LangChain handles used for the RAGAS evaluation performed in chatbot():
# `embeddings` is passed to `evaluate(...)` directly, while `llm` is wrapped
# below as `evaluator_llm`.
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()
# RAGAS dataset container, evaluation entry point and the LangChain adapter.
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
# Judge model handed to `evaluate(..., llm=evaluator_llm)` in chatbot().
evaluator_llm = LangchainLLMWrapper(llm)
# Metric classes instantiated for each evaluation run in chatbot().
from ragas.metrics import LLMContextRecall, ContextRelevance, Faithfulness, ResponseRelevancy, FactualCorrectness

# Load the NASA-specific bi-encoder model and tokenizer.
# Both are loaded once at import time and used by encode_query() to embed
# retrieval queries.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)

# Set up OpenAI client.
# The key is read from the OPENAI_API_KEY environment variable; os.getenv
# returns None when the variable is unset.
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Pinecone setup.
# The key is read from PINECONE_API_KEY; `index` is the handle queried by
# retrieve_relevant_context().
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)
index_name = "scdd-index"
index = pc.Index(index_name)

# System prompt sent as the `system` role message by generate_response().
# It asks the model for five sections ending in a Markdown "Observations
# Requirements Table"; export_to_word() and gpt_response_to_dataframe() later
# parse that table by its `|` delimiters, so the table layout described here
# is what those parsers expect to see in the response.
system_message = """
You are ExosAI, an advanced assistant specializing in Exoplanet and Astrophysics research.

Generate a **detailed and structured** response based on the given **retrieved context and user input**, incorporating key **observables, physical parameters, and technical requirements**. Organize the response into the following sections:

1. **Science Objectives**: Define key scientific objectives related to the science context and user input.
2. **Physical Parameters**: Outline the relevant physical parameters (e.g., mass, temperature, composition).
3. **Observables**: Specify the key observables required to study the science context.
4. **Description of Desired Observations**: Detail the observational techniques, instruments, or approaches necessary to gather relevant data.
5. **Observations Requirements Table**: Generate a table relevant to the Science Objectives, Physical Parameters, Observables and Description of Desired Observations with the following columns and at least 7 rows:
    - Wavelength Band: Should only be UV, Visible and Infrared).
    - Instrument: Should only be Imager, Spectrograph, Polarimeter and Coronagraph).
    - Necessary Values: The necessary values or parameters (wavelength range, spectral resolution where applicable, spatial resolution where applicable, contrast ratio where applicable).
    - Desired Values: The desired values or parameters (wavelength range, spectral resolution where applicable, spatial resolution where applicable).
    - Number of Objects Observed: Estimate the number of objects that need to be observed for a statistically meaningful result or for fulfilling the science objective.
    - Justification: Detailed scientific explanation of why these observations are important for the science objectives.
    - Comments: Additional notes or remarks regarding each observation.

#### **Table Format** 

| Wavelength Band      | Instrument                         | Necessary Values                   | Desired Values                  | Number of Objects Observed      | Justification     | Comments |
|----------------------|------------------------------------|------------------------------------|---------------------------------|---------------------------------|-------------------|----------|

#### **Guiding Constraints (Exclusions & Prioritization)**
- **Wavelength Band Restriction:** Only include **UV, Visible, and Infrared** bands.
- **Instrument Restriction:** Only include **Imager, Spectrograph, Polarimeter, and Coronagraph**.
- **Wavelength Limits:** Prioritize wavelengths between **100 nanometers (nm) and 3 micrometers (μm)**.
- **Allowed Instruments:** **Only include** observations from **direct imaging, spectroscopy, and polarimetry.** **Exclude** transit and radial velocity methods.
- **Exclusion of Existing Facilities:** **Do not reference** existing observatories such as JWST, Hubble, or ground-based telescopes. This work pertains to a **new mission**.
- **Spectral Resolution Constraint:** Limit spectral resolution (**R**) to the range **10,000 – 50,000**.
- **Contrast Ratio:** come up with an appropriate contrast ratio depending on the requirements **.
- **Estimate the "Number of Objects Observed" based on the observational strategy, parameters, instruments, statistical requirements, and feasibility.**
- **Ensure that all parameters remain scientifically consistent.**
- **Include inline references wherever available**. Especially in the Justification column.
- **Pay attention to the retrieved context**.

**Use this table format as a guideline, generate a detailed table dynamically based on the input.**. Ensure that all values align with the provided constraints and instructions.

**Include inline references wherever available**. Especially in the Justification column.

Ensure the response is **structured, clear, and observation requirements table follows this format**. **All included parameters must be scientifically consistent with each other.**
"""

# Embed a query string with the module-level bi-encoder
def encode_query(text):
    """Return a unit-length embedding of *text* as a plain Python list.

    The text is tokenized with ``bi_tokenizer`` (truncated to 128 tokens),
    run through ``bi_model``, and ``last_hidden_state`` is averaged over
    dim 1 before being L2-normalised.

    Args:
        text: The query string to embed.

    Returns:
        list[float]: The flattened, normalised embedding vector.
    """
    tokenized = bi_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    model_output = bi_model(**tokenized)

    # Average the hidden states over dim 1, then leave torch for NumPy.
    pooled = model_output.last_hidden_state.mean(dim=1)
    vector = pooled.detach().numpy().flatten()

    # Scale in place to unit L2 norm.
    vector /= np.linalg.norm(vector)
    return vector.tolist()


# Retrieve supporting passages from the Pinecone index
def retrieve_relevant_context(user_input, context_text, science_objectives="", top_k=3):
    """Query Pinecone for passages related to the user's goal and context.

    A single query string is assembled from the science goal, the user
    context and (when given) the science objectives, embedded with
    ``encode_query`` and sent to the module-level ``index``.

    Args:
        user_input: The science goal entered by the user.
        context_text: Additional free-text context from the user.
        science_objectives: Optional user-defined objectives; appended to the
            query only when non-empty.
        top_k: Number of matches requested from Pinecone.

    Returns:
        str: The ``metadata['text']`` of each match joined by blank lines, or
        a fixed "no relevant context" message when the joined text is blank.
    """
    if science_objectives:
        query_text = f"Science Goal: {user_input}\nContext: {context_text}\nScience Objectives: {science_objectives}"
    else:
        query_text = f"Science Goal: {user_input}\nContext: {context_text}"

    pinecone_result = index.query(
        vector=encode_query(query_text),
        top_k=top_k,
        include_metadata=True,
    )

    passages = [hit['metadata']['text'] for hit in pinecone_result.matches]
    joined_passages = "\n\n".join(passages)

    if joined_passages.strip():
        return joined_passages
    return "No relevant context found for the query."

def clean_retrieved_context(raw_context):
    """Flatten a retrieved passage into a single, tidy line of text.

    Hyphenated line breaks (``"-\\n"``) are removed so split words are
    rejoined, every run of whitespace (including newlines) collapses to one
    space, and leading/trailing whitespace is dropped.

    Args:
        raw_context: The raw passage text.

    Returns:
        str: The cleaned passage.
    """
    # Rejoin words that were hyphenated across a line break.
    dehyphenated = "".join(raw_context.split("-\n"))

    # Newlines are whitespace too, so one substitution covers them as well.
    single_spaced = re.sub(r'\s+', ' ', dehyphenated)

    return single_spaced.strip()

def generate_response(user_input, science_objectives="", relevant_context="", references=None, max_tokens=150, temperature=0.7, top_p=0.9, frequency_penalty=0.5, presence_penalty=0.0):
    """Ask GPT-4o for the structured SCDD response and append ADS references.

    The user prompt is assembled from whichever of ``relevant_context`` and
    ``science_objectives`` are present, and sent together with the
    module-level ``system_message`` through ``client``.

    Args:
        user_input: The science goal entered by the user.
        science_objectives: Optional user-defined objectives. When non-blank
            the model is asked to generate only the remaining sections.
        relevant_context: Retrieved context text; may be empty.
        references: Optional iterable of 6-tuples
            ``(title, abstract, authors, bibcode, pub, pubdate)``.
            ``None`` (the default) means no references.
        max_tokens, temperature, top_p, frequency_penalty, presence_penalty:
            Sampling parameters forwarded unchanged to the chat completion
            request.

    Returns:
        tuple[str, str]: ``(full_response, response_only)`` where
        ``response_only`` is the model text and ``full_response`` is the same
        text followed by an "ADS References:" list when references exist.
    """
    # A None default avoids sharing one list object across calls.
    references = references or []
    # Tolerate None so an unset objectives field does not crash on .strip().
    has_objectives = bool((science_objectives or "").strip())

    # Case 1: both relevant context and science objectives are provided
    if relevant_context and has_objectives:
        combined_input = f"Scientific Context: {relevant_context}\nUser Input: {user_input}\nScience Objectives (User Provided): {science_objectives}\n\nPlease generate only the remaining sections as per the defined format."

    # Case 2: only relevant context is provided
    elif relevant_context:
        combined_input = f"Scientific Context: {relevant_context}\nUser Input: {user_input}\n\nPlease generate a full structured response, including Science Objectives."

    # Case 3: only science objectives are provided (no retrieved context)
    elif has_objectives:
        combined_input = f"User Input: {user_input}\nScience Objectives (User Provided): {science_objectives}\n\nPlease generate only the remaining sections as per the defined format."

    # Default: neither context nor objectives, so the model generates everything
    else:
        combined_input = f"User Input: {user_input}\n\nPlease generate a full structured response, including Science Objectives."

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": combined_input}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty
    )

    # The message content can be None (e.g. a refusal); treat that as empty
    # text instead of raising AttributeError on .strip().
    response_only = (response.choices[0].message.content or "").strip()

    # ADS references are appended separately so callers can also get the
    # bare model text (used for evaluation).
    references_text = ""
    if references:
        references_text = "\n\nADS References:\n" + "\n".join(
            f"- {title} {authors} (Bibcode: {bibcode}) {pub} {pubdate}"
            for title, abstract, authors, bibcode, pub, pubdate in references
        )

    # Full response (for Gradio display)
    full_response = response_only + references_text

    return full_response, response_only

def _split_table_row(line):
    """Split one Markdown table line into stripped cell strings."""
    text = line.strip()
    # Drop at most one outer pipe on each side so empty edge cells survive.
    if text.startswith('|'):
        text = text[1:]
    if text.endswith('|'):
        text = text[:-1]
    return [cell.strip() for cell in text.split('|')]


def _is_separator_row(cells):
    """Return True when every cell is a Markdown divider such as ``---`` or ``:--:``."""
    return all(re.fullmatch(r':?-+:?', cell) for cell in cells)


def export_to_word(response_content, subdomain_definition, science_goal, context, max_tokens, temperature, top_p, frequency_penalty, presence_penalty):
    """Write the generated SCDD and its inputs to a temporary ``.docx`` file.

    The document starts with the user inputs and model parameters, followed
    by the response split on ``### `` headings. The section containing
    "Observations Requirements Table" is rendered as a Word table; a trailing
    "ADS References:" block (as appended by ``generate_response``) or a
    ``### ADS References`` section is rendered as a heading plus one
    paragraph per reference.

    Args:
        response_content: The full response text (Markdown-like).
        subdomain_definition: User-provided subdomain definition.
        science_goal: User-provided science goal.
        context: User-provided context text.
        max_tokens, temperature, top_p, frequency_penalty, presence_penalty:
            Model parameters, recorded in the document for traceability.

    Returns:
        str: Path of the saved temporary ``.docx`` file (not auto-deleted).
    """
    doc = Document()

    # Document title
    doc.add_heading('AI Generated SCDD', 0)

    # User inputs
    doc.add_heading('Subdomain Definition:', level=1)
    doc.add_paragraph(subdomain_definition)

    doc.add_heading('Science Goal:', level=1)
    doc.add_paragraph(science_goal)

    doc.add_heading('User-defined Context:', level=1)
    doc.add_paragraph(context)

    # Model parameters
    doc.add_heading('Model Parameters:', level=1)
    doc.add_paragraph(f"Max Tokens: {max_tokens}")
    doc.add_paragraph(f"Temperature: {temperature}")
    doc.add_paragraph(f"Top-p: {top_p}")
    doc.add_paragraph(f"Frequency Penalty: {frequency_penalty}")
    doc.add_paragraph(f"Presence Penalty: {presence_penalty}")

    # generate_response() appends references as plain "ADS References:" text
    # without a "### " prefix, so the heading split below never isolates
    # them. Peel that trailing block off first (last occurrence wins).
    head, marker, tail = response_content.rpartition('\n\nADS References:\n')
    if marker:
        body_text, trailing_references = head, tail
    else:
        body_text, trailing_references = response_content, ''

    # Split the response into sections based on ### headings
    for section in body_text.split('### '):
        if not section.strip():
            continue

        if 'Observations Requirements Table' in section:
            doc.add_heading('Observations Requirements Table', level=1)

            # Skip only the heading line; blank lines are filtered below, so
            # a table that starts right under the heading keeps its header.
            content_lines = section.split('\n')[1:]

            table_rows = []
            other_lines = []
            for line in content_lines:
                if '|' in line:
                    cells = _split_table_row(line)
                    # The |---|---| divider is Markdown syntax, not data.
                    if not _is_separator_row(cells):
                        table_rows.append(cells)
                elif line.strip():
                    other_lines.append(line)

            if table_rows:
                # Size the table for the widest row so ragged rows cannot
                # index past the last column.
                column_count = max(len(row) for row in table_rows)
                table = doc.add_table(rows=len(table_rows), cols=column_count)
                table.style = 'Table Grid'
                for i, row in enumerate(table_rows):
                    for j, cell_text in enumerate(row):
                        cell = table.cell(i, j)
                        cell.text = cell_text
                        # Append a percentage-typed cell width element (w:tcW).
                        cell._element.get_or_add_tcPr().append(parse_xml(r'<w:tcW w:w="2500" w:type="pct" ' + nsdecls('w') + '/>'))

            # Any non-table text in the section is emitted after the table.
            paragraph_after_table = '\n'.join(other_lines).strip()
            if paragraph_after_table:
                doc.add_paragraph(paragraph_after_table)

        # A model-written "### ADS References" section
        elif section.startswith('ADS References'):
            doc.add_heading('ADS References', level=1)
            for reference in section.split('\n')[1:]:  # skip the heading
                if reference.strip():
                    doc.add_paragraph(reference.strip())

        # All other sections become plain paragraphs
        else:
            doc.add_paragraph(section.strip())

    # References appended by generate_response()
    if trailing_references.strip():
        doc.add_heading('ADS References', level=1)
        for reference in trailing_references.split('\n'):
            if reference.strip():
                doc.add_paragraph(reference.strip())

    # Reserve a unique path, then close the handle before saving so the file
    # descriptor is not leaked and the path can be reopened on any platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
        temp_path = temp_file.name
    doc.save(temp_path)

    return temp_path

def extract_table_from_response(gpt_response):
    """Return the lines spanning the Markdown table in *gpt_response*.

    A line counts as a table line when it contains at least three ``|``
    characters. The result is the contiguous slice of lines from the first
    table line to the last one, inclusive; any non-table lines lying between
    them are kept.

    Args:
        gpt_response: The response text to scan.

    Returns:
        list[str] | None: The table lines, or ``None`` when the response is
        empty or contains no table line.
    """
    if not gpt_response:
        return None

    lines = gpt_response.strip().split("\n")

    # Track positions rather than line text: looking rows up again with
    # list.index() returns the first textual match, which truncates the
    # table whenever the last row repeats an earlier line.
    table_positions = [i for i, line in enumerate(lines) if line.count('|') >= 3]

    if not table_positions:
        return None

    return lines[table_positions[0]:table_positions[-1] + 1]

def gpt_response_to_dataframe(gpt_response):
    """Parse the Markdown table in *gpt_response* into a DataFrame.

    The table lines come from ``extract_table_from_response``. The header is
    the line directly above the first divider row (``|---|---|``, optionally
    with spaces or alignment colons); every later line containing ``|``
    becomes a data row.

    Args:
        gpt_response: The response text containing a Markdown table.

    Returns:
        pandas.DataFrame: One column per header cell and one row per table
        row, all values as stripped strings. Empty when no table, no divider
        row, or no header line above the divider is found.
    """
    def split_cells(line):
        # Drop at most one outer pipe per side so empty edge cells survive.
        text = line.strip()
        if text.startswith('|'):
            text = text[1:]
        if text.endswith('|'):
            text = text[:-1]
        return [cell.strip() for cell in text.split('|')]

    def is_separator(line):
        # Accepts "---", ":---", "---:" and ":---:" cells, with any spacing.
        return '|' in line and all(
            re.fullmatch(r':?-+:?', cell) for cell in split_cells(line)
        )

    if not gpt_response:
        return pd.DataFrame()

    table_lines = extract_table_from_response(gpt_response)
    if not table_lines:
        return pd.DataFrame()

    sep_line_index = next(
        (i for i, line in enumerate(table_lines) if is_separator(line)),
        None,
    )
    # A divider on the first line has no header above it; index -1 would
    # silently pick the last table row instead.
    if sep_line_index is None or sep_line_index == 0:
        return pd.DataFrame()

    header_line = table_lines[sep_line_index - 1]
    if '|' not in header_line:
        return pd.DataFrame()
    headers = split_cells(header_line)
    width = len(headers)

    rows = []
    for line in table_lines[sep_line_index + 1:]:
        # The extracted slice may contain blank or prose lines; skip them.
        if '|' not in line or is_separator(line):
            continue
        cells = split_cells(line)
        if len(cells) < width:
            # Pad short rows so the constructor sees a rectangular table.
            cells = cells + [''] * (width - len(cells))
        elif len(cells) > width:
            # Fold surplus cells (e.g. from an unescaped pipe) into the last
            # column rather than dropping text or raising on a length mismatch.
            cells = cells[:width - 1] + [' | '.join(cells[width - 1:])]
        rows.append(cells)

    return pd.DataFrame(rows, columns=headers)
    
def chatbot(user_input, science_objectives="", context="", subdomain="", max_tokens=150, temperature=0.7, top_p=0.9, frequency_penalty=0.5, presence_penalty=0.0):
    """Run one SCDD generation and stream progress to the Gradio UI.

    This is a generator. Every yielded value is a 7-tuple in the order
    ``(full_response, relevant_context, ragas_evaluation, extracted_table_df,
    word_doc_path, iframe_html, mapify_button_html)``. Progress updates put a
    status message in the first slot and ``None`` in the rest; the final
    yield carries the real results.

    Args:
        user_input: The science goal.
        science_objectives: Optional user-defined science objectives.
        context: Additional user context; also used to derive the ADS query.
        subdomain: Subdomain definition, recorded in the Word export.
        max_tokens, temperature, top_p, frequency_penalty, presence_penalty:
            Sampling parameters passed to ``generate_response`` and recorded
            in the Word export.

    Yields:
        tuple: Status tuples, then one final result tuple.
    """
    def status(message):
        # One progress tuple sized to the seven output slots.
        return message, None, None, None, None, None, None

    # Treat unset text fields as empty so .strip() and joins below are safe.
    science_objectives = science_objectives or ""
    context = context or ""

    yield status("🔄 Connecting with Pinecone...")

    yield status(f"Using Pinecone index: **{index_name}**✅ ")

    yield status("🔎 Retrieving relevant context from Pinecone...")
    # Retrieve relevant context using Pinecone
    relevant_context = retrieve_relevant_context(user_input, context, science_objectives)

    # retrieve_relevant_context() returns ONE string whose passages are joined
    # by blank lines. Iterating over it directly would yield single
    # characters, so split it back into passages before cleaning.
    cleaned_context_list = [
        cleaned
        for cleaned in (
            clean_retrieved_context(chunk)
            for chunk in relevant_context.split("\n\n")
        )
        if cleaned
    ]
    if not cleaned_context_list:
        cleaned_context_list = [clean_retrieved_context(relevant_context)]

    yield status("Context Retrieved successfully ✅ ")

    keywords = extract_keywords_with_gpt(context, client)

    ads_query = " ".join(keywords)

    # Fetch NASA ADS references using the user context
    references = fetch_nasa_ads_references(ads_query)

    yield status("ADS references retrieved... ✅ ")

    yield status("🔄 Generating structured response using GPT-4o...")

    # Generate response from GPT-4o
    full_response, response_only = generate_response(
        user_input=user_input,
        science_objectives=science_objectives,
        relevant_context=relevant_context,
        references=references,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty
    )

    # Report completion right away instead of after the (slow) evaluation.
    yield status("Response generated successfully ✅ ")

    # RAGAS evaluation inputs
    context_ragas = cleaned_context_list
    response_ragas = response_only
    # Keep the goal and the context apart so their boundary words do not fuse.
    query_ragas = f"{user_input}\n{context}".strip()
    reference_ragas = "\n\n".join([f"{title}\n{abstract}" for title, abstract, _, _, _, _ in references])

    dataset = [
        {
            "user_input": query_ragas,
            "retrieved_contexts": context_ragas,
            "response": response_ragas,
            "reference": reference_ragas
        }
    ]

    evaluation_dataset = EvaluationDataset.from_list(dataset)

    # Announce the evaluation/export phase before that work starts.
    yield status("Writing SCDD...Performing RAGAS Evaluation...")

    ragas_evaluation = evaluate(dataset=evaluation_dataset,metrics=[LLMContextRecall(), ContextRelevance(), Faithfulness(), ResponseRelevancy(), FactualCorrectness(coverage="high",atomicity="high")],llm=evaluator_llm, embeddings=embeddings)

    # Prepend user-defined science objectives if provided
    if science_objectives.strip():
        full_response = f"### Science Objectives (User-Defined):\n\n{science_objectives}\n\n" + full_response

    # Export response to Word
    word_doc_path = export_to_word(
        full_response, subdomain, user_input, context,
        max_tokens, temperature, top_p, frequency_penalty, presence_penalty
    )

    # Fetch exoplanet data and generate insights
    # NOTE(review): the insights are computed but not included in any output;
    # either surface them or drop these calls - confirm intent.
    exoplanet_data = fetch_exoplanet_data()
    data_insights_uq = generate_data_insights(user_input, client, exoplanet_data)

    # Extract GPT-generated table into DataFrame
    extracted_table_df = gpt_response_to_dataframe(full_response)

    # Mark the end of the displayed response
    full_response = f"{full_response}\n\nEnd of Response"

    yield status("SCDD produced successfully ✅")

    iframe_html = """<iframe width=\"768\" height=\"432\" src=\"https://miro.com/app/live-embed/uXjVKuVTcF8=/?moveToViewport=-331,-462,5434,3063&embedId=710273023721\" frameborder=\"0\" scrolling=\"no\" allow=\"fullscreen; clipboard-read; clipboard-write\" allowfullscreen></iframe>"""
    mapify_button_html = """<a href=\"https://mapify.so/app/new\" target=\"_blank\"><button>Create Mind Map on Mapify</button></a>"""

    yield full_response, relevant_context, ragas_evaluation, extracted_table_df, word_doc_path, iframe_html, mapify_button_html

# Gradio front end: inputs, sampling controls, outputs and button wiring.
# Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# **ExosAI - NASA SMD PCRAG SCDD Generator [version-2.1]**")

    # --- User inputs -------------------------------------------------------
    gr.Markdown("## **User Inputs**")
    user_input = gr.Textbox(label="Science Goal", placeholder="Enter your Science Goal...", lines=5)
    context = gr.Textbox(label="Additional Context", placeholder="Enter Context Text...", lines=10)
    subdomain = gr.Textbox(label="Subdomain Definition", placeholder="Define your Subdomain...", lines=2)

    # The objectives box starts hidden; the button reveals it.
    science_objectives_button = gr.Button("User-defined Science Objectives [Optional]")
    science_objectives_input = gr.Textbox(
        label="Science Objectives",
        placeholder="Enter Science Objectives...",
        lines=5,
        visible=False,
    )
    science_objectives_button.click(
        lambda: gr.update(visible=True),
        outputs=[science_objectives_input],
    )

    # --- Sampling parameters -----------------------------------------------
    gr.Markdown("### **Model Parameters**")
    max_tokens = gr.Slider(minimum=50, maximum=2000, value=150, step=10, label="Max Tokens")
    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.1, label="Top-p")
    frequency_penalty = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="Frequency Penalty")
    presence_penalty = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Presence Penalty")

    # --- Outputs -----------------------------------------------------------
    gr.Markdown("## **Model Outputs**")
    full_response = gr.Textbox(label="ExosAI SCDD Generation...")
    relevant_context = gr.Textbox(label="Retrieved Context...")
    ragas_evaluation = gr.Textbox(label="RAGAS Evaluation...")
    extracted_table_df = gr.Dataframe(label="SC Requirements Table")
    word_doc_path = gr.File(label="Download SCDD")
    iframe_html = gr.HTML(label="Miro")
    mapify_button_html = gr.HTML(label="Generate Mind Map on Mapify")

    with gr.Row():
        submit_button = gr.Button("Generate SCDD")
        clear_button = gr.Button("Reset")

    # Component lists shared by the two button handlers below.
    _generation_inputs = [
        user_input,
        science_objectives_input,
        context,
        subdomain,
        max_tokens,
        temperature,
        top_p,
        frequency_penalty,
        presence_penalty,
    ]
    _generation_outputs = [
        full_response,
        relevant_context,
        ragas_evaluation,
        extracted_table_df,
        word_doc_path,
        iframe_html,
        mapify_button_html,
    ]

    submit_button.click(
        fn=chatbot,
        inputs=_generation_inputs,
        outputs=_generation_outputs,
        queue=True,
    )

    def _reset_all():
        # One value per component in _generation_inputs + _generation_outputs:
        # text inputs cleared, sliders back to their defaults, outputs emptied.
        return (
            "", "", "", "",
            150, 0.7, 0.9, 0.5, 0.0,
            "", "", None,
            None, None, None, None,
        )

    clear_button.click(_reset_all, outputs=_generation_inputs + _generation_outputs)

demo.launch(share=True)