import os
from cerebras.cloud.sdk import Cerebras
from PyPDF2 import PdfReader
from docx import Document
import gradio as gr
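# Assumed dependencies (PyPI package names inferred from the imports above):
#   pip install cerebras_cloud_sdk PyPDF2 python-docx gradio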
Cerekey = os.getenv("LitReview")
# Initialize the Cerebras AI client with the API key
client = Cerebras(api_key=Cerekey)
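# Note: on Hugging Face Spaces, "LitReview" would be set as a repository secret
# holding the Cerebras API key; when running locally, export it as an
# environment variable before launching the app.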
def extract_text_from_file(file):
    """Extracts text from uploaded PDF or DOCX files."""
    if file.name.endswith(".pdf"):
        reader = PdfReader(file.name)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for pages with no text layer
            text += page.extract_text() or ""
        return text
    elif file.name.endswith(".docx"):
        doc = Document(file.name)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."
def chunk_text(text, max_chars=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    Chunks are capped at roughly `max_chars` characters, a simple proxy for
    the model's token limit (an exact count would require a tokenizer).
    """
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        current_chunk.append(word)
        if len(" ".join(current_chunk)) > max_chars:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
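# Illustrative example of the character-based splitting above:
#   chunk_text("alpha beta gamma delta epsilon", max_chars=20)
#   -> ["alpha beta gamma delta", "epsilon"]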
def analyze_chunk(chunk):
    """
    Analyzes a single chunk of text using the Cerebras Llama model.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format, with et al. if applicable), Year of publication, Title of the article; "
                "Problem addressed; Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            )
        },
        {
            "role": "user",
            "content": chunk
        }
    ]
    try:
        # Stream the completion from the Cerebras API
        stream = client.chat.completions.create(
            messages=messages,
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        result = ""
        # Use `event` for the stream items so the loop does not shadow
        # the `chunk` parameter
        for event in stream:
            result += event.choices[0].delta.content or ""
        return result
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"
def save_as_docx(content):
    """Generates and saves a DOCX file."""
    document = Document()
    document.add_heading("Literature Analysis", level=1)
    document.add_paragraph(content)
    file_path = "Literature_Analysis.docx"
    document.save(file_path)
    return file_path
def analyze_document(file):
    """Processes and analyzes the uploaded document."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return
    chunks = chunk_text(text)
    all_insights = []
    yield "**Processing the document. Please wait...**\n"
    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only append non-empty results
            all_insights.append(result)
    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return
    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        stream = client.chat.completions.create(
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)}
            ],
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=1024,
            temperature=0.2,
            top_p=1
        )
        final_summary = ""
        for event in stream:
            final_summary += event.choices[0].delta.content or ""
        # Save the consolidated summary as a DOCX file next to the app;
        # the Markdown output reports the path, since the interface below
        # exposes only a single text component.
        docx_file = save_as_docx(final_summary)
        yield f"**Final Summary:**\n\n{final_summary}\n\n*Saved as `{docx_file}`.*"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"
# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Progress and Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract and consolidate its content. "
        "Processing can take a while, especially for long documents, so smaller files are recommended."
    ),
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()