Spaces:

SohaAyub
/

genaipaper

Sleeping

App Files Files Community

SohaAyub committed on Feb 13

Commit

ea62d58

verified ·

1 Parent(s): 479da54

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -55

app.py CHANGED Viewed

@@ -8,52 +8,43 @@ from sentence_transformers import SentenceTransformer
 from groq import Groq
 from faster_whisper import WhisperModel
 import os
-import logging
-logging.basicConfig(level=logging.INFO)
 # =========================
 # INITIALIZE MODELS
 # =========================
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 whisper_model = WhisperModel("base", compute_type="int8")
-# Groq API client
 groq_api_key = os.environ.get("GROQ_API_KEY")
-if not groq_api_key:
-    raise ValueError("GROQ_API_KEY environment variable not set!")
-client = Groq(api_key=groq_api_key)
-MODEL_NAME = "llama-3.3-70b-versatile"  # Use exactly this model
 # Global storage
 sections = {}
 section_texts = []
 index = None
 # =========================
-# ARXIV PDF FUNCTIONS
 # =========================
-def is_valid_arxiv_id(arxiv_id):
-    pattern = r"^\d{4}\.\d{4,5}$"
-    return re.match(pattern, arxiv_id)
 def download_arxiv_pdf(arxiv_id):
     try:
         url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
-        response = requests.get(url, timeout=10)
-        if response.status_code != 200:
-            url = f"https://arxiv.org/e-print/{arxiv_id}"
-            response = requests.get(url, timeout=10)
         response.raise_for_status()
         file_path = f"{arxiv_id}.pdf"
         with open(file_path, "wb") as f:
             f.write(response.content)
         return file_path
-    except Exception as e:
-        logging.error(f"Failed to download PDF for {arxiv_id}: {e}")
         return None
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
@@ -61,12 +52,14 @@ def extract_text_from_pdf(pdf_path):
         text += page.get_text()
     return text
 def extract_sections(text):
     patterns = [
-        r"\n([IVX]+\.\s+[A-Z][A-Z\s]+)",
-        r"\n(\d+\.\s+[A-Z][^\n]+)",
-        r"\n(\d+\s+[A-Z][^\n]+)",
-        r"\n([A-Z][A-Z\s]{3,})\n"
     ]
     matches = []
@@ -74,51 +67,66 @@ def extract_sections(text):
         matches.extend(list(re.finditer(pattern, text)))
     matches = sorted(matches, key=lambda x: x.start())
     sections = {}
     for i, match in enumerate(matches):
         title = match.group(1).strip()
         start = match.end()
         end = matches[i+1].start() if i+1 < len(matches) else len(text)
         sections[title] = text[start:end].strip()
     return sections
 # =========================
 # VECTOR STORE
 # =========================
 def build_vector_store(sections_dict):
     global index, section_texts
     section_texts = list(sections_dict.values())
     if len(section_texts) == 0:
         index = None
         return
     embeddings = embedding_model.encode(section_texts)
     embeddings = np.array(embeddings).astype("float32")
     dimension = embeddings.shape[1]
     index = faiss.IndexFlatL2(dimension)
     index.add(embeddings)
 # =========================
 # LOAD PAPER
 # =========================
 def load_paper(arxiv_id):
     global sections, index
-    arxiv_id = arxiv_id.strip()
-    if not is_valid_arxiv_id(arxiv_id):
-        return gr.update(choices=[]), "❌ Invalid arXiv ID format"
     pdf_path = download_arxiv_pdf(arxiv_id)
     if pdf_path is None:
-        return gr.update(choices=[]), "❌ Could not download PDF"
     text = extract_text_from_pdf(pdf_path)
     sections = extract_sections(text)
     build_vector_store(sections)
     return gr.update(choices=list(sections.keys())), "✅ Paper Loaded Successfully"
 # =========================
 # SUMMARIZATION
 # =========================
 def summarize_section(section_title):
     if section_title not in sections:
         return "Please load paper first."
     content = sections[section_title]
     prompt = f"""
@@ -132,36 +140,35 @@ Generate a structured scientific summary:
 Section Title: {section_title}
 Section Content:
-{content[:2500]}  # truncate to avoid exceeding model context
 """
-    try:
-        response = client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.3
-        )
-        answer = response.choices[0].message.content
-    except Exception as e:
-        logging.error("❌ Summarization failed", exc_info=True)
-        answer = f"Error generating summary: {e}"
-    return answer
 # =========================
 # RAG CHAT
 # =========================
 def rag_chat(message, history):
     global index
     if index is None:
         history.append((message, "Please load a paper first."))
-        return history, gr.Textbox.update(value="")
     query_embedding = embedding_model.encode([message])
     query_embedding = np.array(query_embedding).astype("float32")
-    D, I = index.search(query_embedding, k=min(3, len(section_texts)))
-    retrieved = "\n\n".join([section_texts[i] for i in I[0] if i != -1])
     prompt = f"""
 Answer strictly using the provided research paper context.
@@ -174,33 +181,36 @@ Context:
 Question:
 {message}
 """
-    try:
-        response = client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.2
-        )
-        answer = response.choices[0].message.content
-    except Exception as e:
-        logging.error("❌ RAG chat failed", exc_info=True)
-        answer = f"Error generating answer: {e}"
     history.append((message, answer))
-    return history, gr.Textbox.update(value="")
 # =========================
 # VOICE CHAT
 # =========================
 def voice_chat(audio, history):
     if audio is None:
-        return history, gr.Textbox.update(value="")
     segments, _ = whisper_model.transcribe(audio)
     text = "".join([segment.text for segment in segments])
     return rag_chat(text, history)
 # =========================
 # GRADIO UI
 # =========================
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📚 ArXiv RAG Research Assistant")
@@ -209,6 +219,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         load_button = gr.Button("Load Paper")
     load_status = gr.Markdown()
     section_dropdown = gr.Dropdown(label="Select Section")
     summarize_button = gr.Button("Generate Summary")
     summary_output = gr.Markdown()
@@ -228,4 +239,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     send.click(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
     voice_button.click(voice_chat, inputs=[audio_input, chatbot], outputs=[chatbot, msg])
-demo.launch(debug=True)

 from groq import Groq
 from faster_whisper import WhisperModel
 import os
 # =========================
 # INITIALIZE MODELS
 # =========================
 # Sentence-embedding model used to vectorise paper sections and chat queries.
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Speech-to-text model used by the voice chat path.
 whisper_model = WhisperModel("base", compute_type="int8")
 # Retrieve Groq API key from environment variables
 groq_api_key = os.environ.get("GROQ_API_KEY")
 # Chat-completion client. summarize_section() and rag_chat() both call
 # client.chat.completions.create(), so the name must exist at module level;
 # without it those callbacks fail with NameError on first use.
 # NOTE(review): Groq() is expected to raise when no API key is available,
 # which surfaces a missing GROQ_API_KEY at startup - confirm against the SDK.
 client = Groq(api_key=groq_api_key)
 MODEL_NAME = "llama-3.3-70b-versatile"
 # Global storage
 sections = {}        # section title -> section text of the loaded paper
 section_texts = []   # section bodies, in the order they were added to the index
 index = None         # FAISS index over section_texts; None until a paper is loaded
 # =========================
+# PDF FUNCTIONS
 # =========================
 def download_arxiv_pdf(arxiv_id):
     """Download the PDF of an arXiv paper into the working directory.

     Args:
         arxiv_id: arXiv identifier as typed by the user, e.g. "2301.00001".
             Surrounding whitespace is ignored.

     Returns:
         Path of the saved PDF, or None when the identifier is empty or not a
         string, the HTTP request fails, or the file cannot be written.
     """
     if not isinstance(arxiv_id, str) or not arxiv_id.strip():
         return None
     arxiv_id = arxiv_id.strip()
     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
     # The ID is user input: keep path separators and other unexpected
     # characters out of the local file name so the PDF always lands in the
     # working directory. Plain IDs such as "2301.00001" are left unchanged.
     file_path = re.sub(r"[^A-Za-z0-9._-]", "_", arxiv_id) + ".pdf"
     try:
         # A timeout keeps a stalled connection from hanging the UI callback.
         response = requests.get(url, timeout=30)
         response.raise_for_status()
         with open(file_path, "wb") as f:
             f.write(response.content)
     except (requests.RequestException, OSError):
         # Network/HTTP failure or unwritable file: signal "no PDF" to the
         # caller instead of swallowing every exception type.
         return None
     return file_path
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
         text += page.get_text()
     return text
 def extract_sections(text):
     patterns = [
+        r"\n([IVX]+\.\s+[A-Z][A-Z\s]+)",        # Roman numeral ALL CAPS
+        r"\n(\d+\.\s+[A-Z][^\n]+)",             # 1. Introduction
+        r"\n(\d+\s+[A-Z][^\n]+)",               # 1 Introduction
+        r"\n([A-Z][A-Z\s]{3,})\n"               # ALL CAPS standalone
     ]
     matches = []
         matches.extend(list(re.finditer(pattern, text)))
     matches = sorted(matches, key=lambda x: x.start())
     sections = {}
     for i, match in enumerate(matches):
         title = match.group(1).strip()
         start = match.end()
         end = matches[i+1].start() if i+1 < len(matches) else len(text)
         sections[title] = text[start:end].strip()
     return sections
 # =========================
 # VECTOR STORE
 # =========================
 def build_vector_store(sections_dict):
     """Rebuild the module-level vector index from a mapping of sections.

     The mapping's values are stored in the global ``section_texts`` and an
     L2 FAISS index over their embeddings is stored in the global ``index``.
     When the mapping has no values, ``index`` is reset to None and nothing
     is embedded.
     """
     global index, section_texts
     section_texts = [*sections_dict.values()]
     if not section_texts:
         index = None
         return
     # Embed every section body and hand FAISS a float32 matrix.
     vectors = np.array(embedding_model.encode(section_texts)).astype("float32")
     index = faiss.IndexFlatL2(vectors.shape[1])
     index.add(vectors)
 # =========================
 # LOAD PAPER
 # =========================
 def load_paper(arxiv_id):
     """Download an arXiv paper, split it into sections and index them.

     Args:
         arxiv_id: arXiv identifier typed by the user; surrounding whitespace
             is ignored and an empty value is rejected without a download.

     Returns:
         A ``(dropdown_update, status_message)`` pair. The update carries the
         detected section titles as dropdown choices (empty on failure); the
         message reports the outcome.
     """
     global sections
     arxiv_id = (arxiv_id or "").strip()
     pdf_path = download_arxiv_pdf(arxiv_id) if arxiv_id else None
     if pdf_path is None:
         return gr.update(choices=[]), "❌ Invalid arXiv ID"
     text = extract_text_from_pdf(pdf_path)
     sections = extract_sections(text)
     build_vector_store(sections)
     if not sections:
         # No heading matched: the index stays empty, so do not claim success.
         return gr.update(choices=[]), "⚠️ Paper downloaded, but no sections could be detected"
     return gr.update(choices=list(sections.keys())), "✅ Paper Loaded Successfully"
 # =========================
 # SUMMARIZATION
 # =========================
 def summarize_section(section_title):
     if section_title not in sections:
         return "Please load paper first."
     content = sections[section_title]
     prompt = f"""
 Section Title: {section_title}
 Section Content:
+{content[:6000]}
 """
+    response = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.3
+    )
+    return response.choices[0].message.content
 # =========================
 # RAG CHAT
 # =========================
 def rag_chat(message, history):
     global index
     if index is None:
         history.append((message, "Please load a paper first."))
+        return history, ""
     query_embedding = embedding_model.encode([message])
     query_embedding = np.array(query_embedding).astype("float32")
+    D, I = index.search(query_embedding, k=3)
+    retrieved = "\n\n".join([section_texts[i] for i in I[0]])
     prompt = f"""
 Answer strictly using the provided research paper context.
 Question:
 {message}
 """
+    response = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.2
+    )
+    answer = response.choices[0].message.content
     history.append((message, answer))
+    return history, ""
 # =========================
 # VOICE CHAT
 # =========================
 def voice_chat(audio, history):
     """Transcribe a spoken question and answer it through the RAG chat.

     Args:
         audio: Audio input handed to ``whisper_model.transcribe``; None when
             nothing was recorded.
         history: Chat history; forwarded to ``rag_chat``.

     Returns:
         Whatever ``rag_chat`` returns for the transcribed question, or
         ``(history, "")`` unchanged when there is no audio or the transcript
         is empty.
     """
     if audio is None:
         return history, ""
     segments, _ = whisper_model.transcribe(audio)
     text = "".join(segment.text for segment in segments).strip()
     if not text:
         # Nothing was transcribed: there is no question to send to the model.
         return history, ""
     return rag_chat(text, history)
 # =========================
 # GRADIO UI
 # =========================
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📚 ArXiv RAG Research Assistant")
         load_button = gr.Button("Load Paper")
     load_status = gr.Markdown()
     section_dropdown = gr.Dropdown(label="Select Section")
     summarize_button = gr.Button("Generate Summary")
     summary_output = gr.Markdown()
     send.click(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
     voice_button.click(voice_chat, inputs=[audio_input, chatbot], outputs=[chatbot, msg])
+demo.launch(debug=True)