PraneshJs committed on
Commit
b9af798
·
verified ·
1 Parent(s): 4543e5a

Updated to understand and answer questions about the uploaded paper

Browse files
Files changed (1) hide show
  1. app.py +45 -17
app.py CHANGED
@@ -11,13 +11,12 @@ from langchain_huggingface import HuggingFaceEmbeddings
11
  from threading import Thread
12
  from dotenv import load_dotenv
13
 
14
-
15
  load_dotenv()
16
 
17
  # === CONFIG ===
18
  STORAGE_DIR = "storage"
19
  CLEANUP_INTERVAL = 600 # 10 min
20
- SESSION_TTL = 1800 # 30 min
21
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
22
  OPENROUTER_MODEL = "z-ai/glm-4.5-air:free"
23
 
@@ -42,19 +41,33 @@ def process_pdf(pdf_file):
42
  return "No file uploaded.", "", []
43
  session_id = str(uuid.uuid4())
44
  reader = PdfReader(pdf_file.name)
 
 
45
  text = "".join([page.extract_text() for page in reader.pages if page.extract_text()])
46
 
 
 
 
 
 
 
47
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
48
  chunks = splitter.split_text(text)
49
 
 
50
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
51
  session_path = os.path.join(STORAGE_DIR, session_id)
52
  os.makedirs(session_path, exist_ok=True)
53
-
54
  db = FAISS.from_texts(chunks, embeddings)
55
  db.save_local(session_path)
56
 
57
- chat_history = [("System", "Paper uploaded and processed. You can now ask questions.")]
 
 
 
 
 
 
58
  return f"Paper uploaded successfully. Session ID: {session_id}", session_id, chat_history
59
 
60
  # === QUERY FUNCTION ===
@@ -68,28 +81,46 @@ def query_paper(session_id, user_message, chat_history):
68
  return chat_history, ""
69
 
70
  session_path = os.path.join(STORAGE_DIR, session_id)
 
 
 
 
 
 
 
 
 
 
 
71
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
72
  db = FAISS.load_local(session_path, embeddings, allow_dangerous_deserialization=True)
73
  retriever = db.as_retriever(search_kwargs={"k": 3})
74
 
75
- # Use invoke() method
76
  docs = retriever.invoke(user_message)
77
  context = "\n\n".join([d.page_content for d in docs])
78
 
 
79
  prompt = f"""
80
- You are an AI assistant. Explain the following research paper content in simple terms and answer the question.
81
- Context from paper:
 
 
 
 
 
 
82
  {context}
83
 
84
- Question: {user_message}
85
- Answer:
86
  """
87
 
88
  headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"}
89
  payload = {
90
  "model": OPENROUTER_MODEL,
91
  "messages": [
92
- {"role": "system", "content": "You are a helpful research explainer."},
93
  {"role": "user", "content": prompt}
94
  ]
95
  }
@@ -105,7 +136,6 @@ Answer:
105
  except Exception as e:
106
  answer = f"Error: {str(e)}"
107
 
108
- # Update chat history
109
  chat_history = chat_history or []
110
  chat_history.append((user_message, answer))
111
 
@@ -119,7 +149,7 @@ with gr.Blocks() as demo:
119
  pdf_input = gr.File(label="Upload Research Paper (PDF)", file_types=[".pdf"])
120
  session_box = gr.Textbox(label="Session ID", interactive=False)
121
 
122
- chatbot = gr.Chatbot(label="Chat about your paper", height=400)
123
  user_message = gr.Textbox(label="Ask a question", placeholder="What is this paper about?")
124
 
125
  with gr.Row():
@@ -131,7 +161,7 @@ with gr.Blocks() as demo:
131
  state_chat = gr.State([])
132
  state_session = gr.State("")
133
 
134
- # Upload button functionality
135
  def handle_upload(pdf_file):
136
  status, session_id, chat_history = process_pdf(pdf_file)
137
  return status, session_id, chat_history
@@ -142,7 +172,7 @@ with gr.Blocks() as demo:
142
  outputs=[session_box, state_session, state_chat]
143
  )
144
 
145
- # Ask button functionality
146
  def handle_question(session_id, message, chat_history):
147
  updated_chat, _ = query_paper(session_id, message, chat_history)
148
  return updated_chat, ""
@@ -157,7 +187,6 @@ with gr.Blocks() as demo:
157
  outputs=[state_chat]
158
  )
159
 
160
- # Submit on enter
161
  user_message.submit(
162
  fn=handle_question,
163
  inputs=[state_session, user_message, state_chat],
@@ -168,7 +197,7 @@ with gr.Blocks() as demo:
168
  outputs=[state_chat]
169
  )
170
 
171
- # Clear chat
172
  def clear_chat():
173
  return [], []
174
 
@@ -177,7 +206,6 @@ with gr.Blocks() as demo:
177
  outputs=[chatbot, state_chat]
178
  )
179
 
180
- # Update chatbot display when chat history changes
181
  state_chat.change(
182
  lambda chat: chat,
183
  inputs=[state_chat],
 
11
  from threading import Thread
12
  from dotenv import load_dotenv
13
 
 
14
  load_dotenv()
15
 
16
  # === CONFIG ===
17
  STORAGE_DIR = "storage"
18
  CLEANUP_INTERVAL = 600 # 10 min
19
+ SESSION_TTL = 1000 # ~16.7 min (value is in seconds)
20
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
21
  OPENROUTER_MODEL = "z-ai/glm-4.5-air:free"
22
 
 
41
  return "No file uploaded.", "", []
42
  session_id = str(uuid.uuid4())
43
  reader = PdfReader(pdf_file.name)
44
+
45
+ # Extract text
46
  text = "".join([page.extract_text() for page in reader.pages if page.extract_text()])
47
 
48
+ # Metadata
49
+ page_count = len(reader.pages)
50
+ first_page_text = reader.pages[0].extract_text() if page_count > 0 else ""
51
+ guessed_title = first_page_text.split("\n")[0] if first_page_text else "Unknown Title"
52
+
53
+ # Split text
54
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
55
  chunks = splitter.split_text(text)
56
 
57
+ # Embeddings + FAISS
58
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
59
  session_path = os.path.join(STORAGE_DIR, session_id)
60
  os.makedirs(session_path, exist_ok=True)
 
61
  db = FAISS.from_texts(chunks, embeddings)
62
  db.save_local(session_path)
63
 
64
+ # Save metadata
65
+ metadata_path = os.path.join(session_path, "metadata.txt")
66
+ with open(metadata_path, "w", encoding="utf-8") as f:
67
+ f.write(f"title={guessed_title}\n")
68
+ f.write(f"pages={page_count}\n")
69
+
70
+ chat_history = [("System", f"Paper uploaded. Title: {guessed_title}, Pages: {page_count}. You can now ask questions.")]
71
  return f"Paper uploaded successfully. Session ID: {session_id}", session_id, chat_history
72
 
73
  # === QUERY FUNCTION ===
 
81
  return chat_history, ""
82
 
83
  session_path = os.path.join(STORAGE_DIR, session_id)
84
+
85
+ # Load metadata
86
+ metadata_path = os.path.join(session_path, "metadata.txt")
87
+ metadata = {}
88
+ if os.path.exists(metadata_path):
89
+ with open(metadata_path, "r", encoding="utf-8") as f:
90
+ for line in f:
91
+ k, v = line.strip().split("=", 1)
92
+ metadata[k] = v
93
+
94
+ # Load retriever
95
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
96
  db = FAISS.load_local(session_path, embeddings, allow_dangerous_deserialization=True)
97
  retriever = db.as_retriever(search_kwargs={"k": 3})
98
 
99
+ # Retrieve context
100
  docs = retriever.invoke(user_message)
101
  context = "\n\n".join([d.page_content for d in docs])
102
 
103
+ # Prompt
104
  prompt = f"""
105
+ You are an AI assistant that explains research papers in clear, structured, simple terms.
106
+ You can use BOTH metadata and the paper content.
107
+
108
+ Metadata:
109
+ - Title: {metadata.get('title','Unknown')}
110
+ - Pages: {metadata.get('pages','Unknown')}
111
+
112
+ Paper content (retrieved chunks):
113
  {context}
114
 
115
+ User Question: {user_message}
116
+ Answer in plain English with clarity.
117
  """
118
 
119
  headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"}
120
  payload = {
121
  "model": OPENROUTER_MODEL,
122
  "messages": [
123
+ {"role": "system", "content": "You are a helpful research paper explainer. Use metadata if the user asks about title, authors, or page count. Otherwise, use the retrieved context."},
124
  {"role": "user", "content": prompt}
125
  ]
126
  }
 
136
  except Exception as e:
137
  answer = f"Error: {str(e)}"
138
 
 
139
  chat_history = chat_history or []
140
  chat_history.append((user_message, answer))
141
 
 
149
  pdf_input = gr.File(label="Upload Research Paper (PDF)", file_types=[".pdf"])
150
  session_box = gr.Textbox(label="Session ID", interactive=False)
151
 
152
+ chatbot = gr.Chatbot(label="Chat about your paper", height=400, type="messages")
153
  user_message = gr.Textbox(label="Ask a question", placeholder="What is this paper about?")
154
 
155
  with gr.Row():
 
161
  state_chat = gr.State([])
162
  state_session = gr.State("")
163
 
164
+ # Upload
165
  def handle_upload(pdf_file):
166
  status, session_id, chat_history = process_pdf(pdf_file)
167
  return status, session_id, chat_history
 
172
  outputs=[session_box, state_session, state_chat]
173
  )
174
 
175
+ # Ask
176
  def handle_question(session_id, message, chat_history):
177
  updated_chat, _ = query_paper(session_id, message, chat_history)
178
  return updated_chat, ""
 
187
  outputs=[state_chat]
188
  )
189
 
 
190
  user_message.submit(
191
  fn=handle_question,
192
  inputs=[state_session, user_message, state_chat],
 
197
  outputs=[state_chat]
198
  )
199
 
200
+ # Clear
201
  def clear_chat():
202
  return [], []
203
 
 
206
  outputs=[chatbot, state_chat]
207
  )
208
 
 
209
  state_chat.change(
210
  lambda chat: chat,
211
  inputs=[state_chat],