AzizWazir committed on
Commit
1b61846
·
verified ·
1 Parent(s): 5846262

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -54
app.py CHANGED
@@ -10,40 +10,12 @@ import docx
10
  import pandas as pd
11
 
12
  # Initialize the summarization and question-answering models from Hugging Face
13
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=100) # Concise summary settings
14
  qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2") # Better QA model for concise answers
15
 
16
  # Sentence Transformer for embedding-based retrieval
17
  embedder = SentenceTransformer('all-MiniLM-L6-v2') # A compact and efficient embedding model
18
 
19
- # Groq API Configuration
20
- API_KEY = "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" # Replace with your actual Groq API key
21
- API_URL = "https://api.groq.com/openai/v1/chat/completions" # Default endpoint for chat completions (check Groq docs)
22
-
23
- # PDF Processing Function
24
- def extract_text_from_pdf(pdf_file):
25
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
26
- text = ""
27
- for page in doc:
28
- text += page.get_text()
29
- return text
30
-
31
- # MS Word Processing Function
32
- def extract_text_from_word(word_file):
33
- doc = docx.Document(BytesIO(word_file.read()))
34
- text = ""
35
- for para in doc.paragraphs:
36
- text += para.text + "\n"
37
- return text
38
-
39
- # Excel File Processing Function
40
- def extract_text_from_excel(excel_file):
41
- df = pd.read_excel(excel_file, engine="openpyxl") # Use openpyxl to read .xlsx files
42
- text = ""
43
- for col in df.columns:
44
- text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n" # Join values in each column
45
- return text
46
-
47
  # FAISS Indexing Function with better embedding-based chunking
48
  def create_faiss_index(text):
49
  paragraphs = text.split('\n\n') # Assuming paragraphs are separated by double newlines
@@ -58,28 +30,8 @@ def retrieve_relevant_chunk(query, index, paragraphs):
58
  D, I = index.search(np.array(query_embedding).astype(np.float32), 1) # Search for the closest chunk
59
  return paragraphs[I[0][0]] # Return the most relevant paragraph
60
 
61
- # Function to call Groq API for summarization (optional)
62
- def call_groq_api(input_text):
63
- headers = {
64
- "Authorization": f"Bearer {API_KEY}",
65
- "Content-Type": "application/json"
66
- }
67
- payload = {
68
- "model": "gpt-3.5-turbo", # Use the correct model if available
69
- "messages": [{"role": "user", "content": input_text}],
70
- "n": 1
71
- }
72
- try:
73
- response = requests.post(API_URL, json=payload, headers=headers)
74
- if response.status_code == 200:
75
- return response.json().get("choices", [{}])[0].get("message", {}).get("content", "No result found")
76
- else:
77
- return f"Error: {response.status_code} - {response.text}"
78
- except Exception as e:
79
- return f"Error: {str(e)}"
80
-
81
  # Streamlit UI
82
- st.title("RAG-based PDF, Word, Excel Summarizer and Q&A")
83
 
84
  # Upload File
85
  uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
@@ -89,11 +41,20 @@ if uploaded_file:
89
 
90
  # Extract text based on file type
91
  if file_type == "application/pdf":
92
- text = extract_text_from_pdf(uploaded_file)
 
 
 
93
  elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
94
- text = extract_text_from_word(uploaded_file)
 
 
 
95
  elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
96
- text = extract_text_from_excel(uploaded_file)
 
 
 
97
  else:
98
  st.error("Unsupported file type!")
99
  text = ""
@@ -124,7 +85,7 @@ if uploaded_file:
124
  if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
125
  try:
126
  st.write("Summarizing...")
127
- summary = summarizer(relevant_chunk, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
128
  st.write(f"Summary: {summary}")
129
  except Exception as e:
130
  st.write(f"Error summarizing text: {str(e)}")
 
10
  import pandas as pd
11
 
12
  # Initialize the summarization and question-answering models from Hugging Face
13
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn", min_length=50, max_length=80) # Concise summary settings
14
  qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2") # Better QA model for concise answers
15
 
16
  # Sentence Transformer for embedding-based retrieval
17
  embedder = SentenceTransformer('all-MiniLM-L6-v2') # A compact and efficient embedding model
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # FAISS Indexing Function with better embedding-based chunking
20
  def create_faiss_index(text):
21
  paragraphs = text.split('\n\n') # Assuming paragraphs are separated by double newlines
 
30
  D, I = index.search(np.array(query_embedding).astype(np.float32), 1) # Search for the closest chunk
31
  return paragraphs[I[0][0]] # Return the most relevant paragraph
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Streamlit UI
34
+ st.title("Concise Summarizer and Q&A")
35
 
36
  # Upload File
37
  uploaded_file = st.file_uploader("Upload a PDF, Word, or Excel file", type=["pdf", "docx", "xlsx"])
 
41
 
42
  # Extract text based on file type
43
  if file_type == "application/pdf":
44
+ doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
45
+ text = ""
46
+ for page in doc:
47
+ text += page.get_text()
48
  elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
49
+ doc = docx.Document(BytesIO(uploaded_file.read()))
50
+ text = ""
51
+ for para in doc.paragraphs:
52
+ text += para.text + "\n"
53
  elif file_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
54
+ df = pd.read_excel(uploaded_file, engine="openpyxl") # Use openpyxl to read .xlsx files
55
+ text = ""
56
+ for col in df.columns:
57
+ text += "\n".join(df[col].dropna().astype(str).tolist()) + "\n"
58
  else:
59
  st.error("Unsupported file type!")
60
  text = ""
 
85
  if len(relevant_chunk.split()) > 20: # Only summarize if the text is sufficiently long
86
  try:
87
  st.write("Summarizing...")
88
+ summary = summarizer(relevant_chunk, max_length=80, min_length=50, do_sample=False)[0]['summary_text']
89
  st.write(f"Summary: {summary}")
90
  except Exception as e:
91
  st.write(f"Error summarizing text: {str(e)}")