Update app.py
app.py CHANGED
@@ -1,2 +1,301 @@
import streamlit as st
import pandas as pd
from openai import OpenAI
import requests
import certifi
import tiktoken
from tiktoken import get_encoding
from pinecone import Pinecone, ServerlessSpec
import time
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')  # read from the environment; do not hard-code API keys in source
INDEX_NAME = "gradient-cyber"
BATCH_SIZE = 100
MAX_RESULTS = 1000

# Initialize OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

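# The index dimension (1536) matches the vector size returned by text-embedding-ada-002,
# and the cosine metric suits the normalized embeddings that model produces.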
# Check if the index already exists before creating it
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
index = pc.Index(INDEX_NAME)

# Define helper functions
def truncate_text(text, max_tokens):
    # cl100k_base is the tokenizer used by text-embedding-ada-002, so the token
    # count here matches what the embeddings endpoint will see
    tokenizer = get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    return tokenizer.decode(tokens[:max_tokens])

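# generate_embedding returns (embedding, total_tokens_used) on success and (None, 0)
# after three failed attempts; failures are retried with exponential backoff
# (2 ** attempt seconds between tries).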
def generate_embedding(text):
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=text
            )
            return response.data[0].embedding, response.usage.total_tokens
        except Exception as e:
            if attempt == max_retries - 1:
                st.error(f"Error creating embedding after {max_retries} attempts: {str(e)}")
                return None, 0
            time.sleep(2 ** attempt)  # Exponential backoff

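# Pinecone caps the size of a single upsert request, so vectors are written in
# batches (100 at a time by default) instead of one large call.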
def upsert_in_batches(index, vectors, batch_size=100):
    batches = [vectors[i:i + batch_size] for i in range(0, len(vectors), batch_size)]
    for batch in batches:
        try:
            index.upsert(vectors=batch, namespace="ns1")
        except Exception as e:
            st.error(f"Error upserting batch: {e}")

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

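# OpenAI embeddings are normalized to unit length, so the plain dot product below
# behaves as cosine similarity. Note that each call issues two embedding requests.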
def semantic_similarity(text1, text2):
    embedding1, _ = generate_embedding(text1)
    embedding2, _ = generate_embedding(text2)
    if embedding1 is None or embedding2 is None:
        return 0
    return sum(a * b for a, b in zip(embedding1, embedding2))

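# expand_query asks gpt-4o-mini for a handful of related questions/terms and appends
# them to the original query; the expanded text is what gets embedded for retrieval.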
def expand_query(original_query):
    try:
        expansion_prompt = f"Expand the following query into 3-5 related questions or terms: '{original_query}'"

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": expansion_prompt}],
            max_tokens=100,
            temperature=0.7
        )
        return original_query + " " + response.choices[0].message.content
    except Exception as e:
        st.error(f"Error in query expansion: {str(e)}")
        return original_query  # Return original query if expansion fails

def truncate_context(context, max_tokens=14000):
    encoding = tiktoken.get_encoding("cl100k_base")
    encoded = encoding.encode(context)
    truncated = encoded[:max_tokens]
    return encoding.decode(truncated)

# Streamlit UI for file upload
st.title("Gradient-cyber")
uploaded_file = st.file_uploader("Upload an Excel file", type=["xlsx"])

if uploaded_file is not None:
    st.write("File uploaded successfully!")
    # Load Excel file
    df = pd.read_excel(uploaded_file)
    st.write("Excel file loaded:")
    st.write(df.head())

    # Concatenate text from all columns for each row into a readable sentence
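    # e.g. a row becomes "ID: 101. eventDtgTime: 2024-05-01 12:00. displayTitle: Suspicious login ..."
    # (illustrative values only; every column of the uploaded sheet is included)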
    def create_meaningful_sentence(row):
        return '. '.join([f"{col.replace('_', ' ')}: {row[col]}" for col in df.columns])

    df['combined_text'] = df.apply(create_meaningful_sentence, axis=1)
    st.write("Columns concatenated into meaningful sentences:")
    st.write(df[['combined_text']].head())

    vectors = []
    # Process each row in the DataFrame
    total_tokens_used = 0
    total_requests = 0
    for i, row in df.iterrows():
        text = row['combined_text']
        # Truncate text to fit the embedding model's maximum input length (8191 tokens for text-embedding-ada-002)
        text = truncate_text(text, max_tokens=8191)
        embedding, tokens_used = generate_embedding(text)
        if embedding is not None:
            total_tokens_used += tokens_used
            total_requests += 1

            # Truncate text fields to reduce metadata size
            def truncate_field(field, max_length=500):
                return str(field)[:max_length] if not pd.isna(field) else ''

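            # Pinecone enforces a per-vector metadata size limit, which is why long fields are clipped to 500 characters above.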
            # Prepare metadata, handling NaN values and converting fields to strings
            metadata = {
                "ID": truncate_field(row['ID']),
                "eventDtgTime": truncate_field(row['eventDtgTime']),
                "alerts": truncate_field(row.get('alerts', '')),
                "displayTitle": truncate_field(row['displayTitle']),
                "instantAnalytics": truncate_field(row.get('instantAnalytics', '')),
                "detailedText": truncate_field(row.get('detailedText', '')),
                "msgPrecs": truncate_field(row.get('msgPrecs', '')),
                "unit": truncate_field(row.get('unit', '')),
                "size": truncate_field(row.get('size', '')),
                "embedHtml": truncate_field(row.get('embedHtml', '')),
                "dataSources": truncate_field(row.get('dataSources', '')),
                "snippetText": truncate_field(row.get('snippetText', '')),
                "contentLink": truncate_field(row.get('contentLink', '')),
                "description": truncate_field(row.get('description', '')),
                "imageDescription": truncate_field(row.get('imageDescription', '')),
                "reportSummary": truncate_field(row.get('reportSummary', '')),
                "authorName": truncate_field(row.get('authorName', '')),
                "timeReportCompleted": truncate_field(row.get('timeReportCompleted', '')),
                "attachment": truncate_field(row.get('attachment', '')),
                "latitude": truncate_field(row.get('latitude', '')),
                "securityLevels": truncate_field(row.get('securityLevels', '')),
                "imagereSourceLink": truncate_field(row.get('imagereSourceLink', '')),
                "eventDtg": truncate_field(row.get('eventDtg', '')),
                "status": truncate_field(row.get('status', '')),
                "users": truncate_field(row.get('users', '')),
                "name": truncate_field(row.get('name', '')),
                "sessions": truncate_field(row.get('sessions', '')),
                "fiscalStatus": truncate_field(row.get('fiscalStatus', '')),
                "sentimentSummary": truncate_field(row.get('sentimentSummary', '')),
                "sourceOrg": truncate_field(row.get('sourceOrg', '')),
                "dateCreated": truncate_field(row.get('dateCreated', '')),
                "active": truncate_field(row.get('active', '')),
                "responseSummary": truncate_field(row.get('responseSummary', '')),
                "comparisonCommunitiesCountries": truncate_field(row.get('comparisonCommunitiesCountries', '')),
                "activity": truncate_field(row.get('activity', '')),
                "applications": truncate_field(row.get('applications', '')),
                "url": truncate_field(row.get('url', '')),
                "timeZones": truncate_field(row.get('timeZones', '')),
                "location": truncate_field(row.get('location', '')),
                "longitude": truncate_field(row.get('longitude', '')),
                "dateModified": truncate_field(row.get('dateModified', '')),
                "pedigrees": truncate_field(row.get('pedigrees', '')),
                "gistComment": truncate_field(row.get('gistComment', '')),
                "tag": truncate_field(row.get('tag', '')),
                "geoCode": truncate_field(row.get('geoCode', '')),
                "time": truncate_field(row.get('time', '')),
                "timeReportRouted": truncate_field(row.get('timeReportRouted', '')),
                "rteToOrg": truncate_field(row.get('rteToOrg', '')),
                "copyReportToOrg": truncate_field(row.get('copyReportToOrg', '')),
                "sourceOrganization": truncate_field(row.get('sourceOrganization', '')),
                "coordinates": truncate_field(row.get('coordinates', '')),
                "image1": truncate_field(row.get('image1', '')),
                "image2": truncate_field(row.get('image2', '')),
                "image3": truncate_field(row.get('image3', '')),
                "image4": truncate_field(row.get('image4', '')),
                "image5": truncate_field(row.get('image5', '')),
                "numEmailsSent": truncate_field(row.get('numEmailsSent', '')),
                "lastEmailDate": truncate_field(row.get('lastEmailDate', '')),
                "reportDtg": truncate_field(row.get('reportDtg', '')),
                "metadata": truncate_field(row.get('metadata', '')),
                "eventOrganizations": truncate_field(row.get('eventOrganizations', '')),
                "classification": truncate_field(row.get('classification', '')),
                "assetIPs": truncate_field(row.get('assetIPs', '')),
                "sitrepTemplate": truncate_field(row.get('sitrepTemplate', '')),
                "industry": truncate_field(row.get('industry', '')),
                "networkSegmentList": truncate_field(row.get('networkSegmentList', '')),
                "approvedDate": truncate_field(row.get('approvedDate', '')),
                "incident": truncate_field(row.get('incident', '')),
                "sendEmail": truncate_field(row.get('sendEmail', '')),
                "newFormat": truncate_field(row.get('newFormat', '')),
                "duMapping": truncate_field(row.get('duMapping', '')),
                "jsonTag": truncate_field(row.get('jsonTag', '')),
                "createdFrom": truncate_field(row.get('createdFrom', '')),
                "integrationData": truncate_field(row.get('integrationData', '')),
                "mtti": truncate_field(row.get('mtti', '')),
                "mttd": truncate_field(row.get('mttd', '')),
                "mttr": truncate_field(row.get('mttr', '')),
                "oldEventDate": truncate_field(row.get('oldEventDate', '')),
                "org_event_name": truncate_field(row.get('org_event_name', '')),
                "combined_text": text  # Add combined text to metadata
            }

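            # Pinecone vector IDs must be strings, hence str(row['ID'])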
            vectors.append({'id': str(row['ID']), 'values': embedding, 'metadata': metadata})

    if vectors:
        upsert_in_batches(index, vectors, BATCH_SIZE)
        st.success("Data successfully uploaded to Pinecone.")
        st.info(f"Total tokens used: {total_tokens_used}")
        st.info(f"Total requests made: {total_requests}")
    else:
        st.warning("No embeddings were generated.")

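# Query flow: expand the user's query, embed it, retrieve the top 50 candidates from
# Pinecone, re-rank them by embedding similarity, keep the best 10, and have
# gpt-4o-mini answer from that context.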
# Query input and response
query = st.text_input("Enter your query:")

if query:
    try:
        expanded_query = expand_query(query)
        query_embedding, _ = generate_embedding(expanded_query)
        if query_embedding is not None:
            # Perform the search in Pinecone
            results = index.query(
                namespace="ns1",
                vector=query_embedding,
                top_k=50,
                include_metadata=True
            )

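            # Re-rank the 50 Pinecone matches by re-embedding the query and each match's
            # combined_text; note this issues two embedding API calls per match.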
            # Semantic filtering
            filtered_results = sorted(
                results['matches'],
                key=lambda x: semantic_similarity(query, x['metadata']['combined_text']),
                reverse=True
            )[:10]  # Keep only the top 10 re-ranked matches to limit context size

            # Prepare context for GPT
            context = "\n".join([
                f"ID: {match['id']}\n" +
                f"Event Date/Time: {match['metadata'].get('eventDtgTime', 'N/A')}\n" +
                f"Display Title: {match['metadata'].get('displayTitle', 'N/A')}\n" +
                f"Status: {match['metadata'].get('status', 'N/A')}\n" +
                f"Combined Text: {match['metadata'].get('combined_text', 'N/A')}\n" +
                "---"
                for match in filtered_results
            ])

            # Truncate the context
            truncated_context = truncate_context(context)

            # Prepare the prompt for GPT
            system_prompt = """Core Capabilities: Expert Knowledge on SITREPs: Understand and explain SITREP components
            like threat analysis, incident summaries, and risk assessments.
            Flexible Query Handling: Interpret and respond to diverse queries, supporting access to data by date, theme, severity, etc.
            Data Retrieval: Provide specific information such as incident dates, threat actors, and mitigation strategies. Offer summaries or detailed reports based on user needs.
            Analytical Engagement: Engage in discussions, offering insights and hypotheses. Support user analysis of cyber threats and incidents.
            Interaction Guidelines: Understanding Queries: Use NLP to interpret user queries and ask clarifying questions if needed.
            Providing Context: Give context to help users understand the relevance and implications of information.
            Customizing Detail: Adjust detail levels based on user preferences, providing summaries or deep dives. Allow users to specify the format of information (e.g., bullet points, detailed paragraphs, tables).
            Encouraging Exploration: Suggest related queries and additional information. Provide thorough explanations to support learning and decision-making."""
            user_prompt = f"""Query: {query}
            Relevant Information:
            {truncated_context}
            Provide a clear, concise, and comprehensive answer. Synthesize information from multiple entries if necessary. Cite specific details and examples when applicable. If information is missing, state what is known and what remains uncertain."""

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=1000,
                temperature=0.7
            )
            answer = response.choices[0].message.content
            st.write("Answer to your query:")
            st.write(answer)
        else:
            st.error("Failed to generate query embedding.")
    except Exception as e:
        st.error(f"Error processing query: {str(e)}")
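# To run this app locally (assuming streamlit, pandas, openai, tiktoken, python-dotenv,
# openpyxl and the Pinecone client package are installed, with OPENAI_API_KEY and
# PINECONE_API_KEY set in .env):
#   streamlit run app.py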