Saturday, August 24, 2024

Adding RAG to my Dynamic AI Agent Workflows

I modified the script to properly create paragraph-sized chunks to be returned by semantic search, and I modified the script that uses the data to properly return RAG enhancements for a prompt.
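
To see what the paragraph chunking does, here's a minimal sketch with made-up sample text (the pattern is the same one used in the scripts below):

import re

# Split at newlines that follow sentence-ending punctuation, so a
# hard-wrapped line that doesn't end a sentence stays in its chunk.
pattern = r'(?<=[.!?])\s*\n+'

sample = "First chunk. Two sentences.\nSecond chunk starts here\nand continues on a wrapped line."
chunks = [c.strip() for c in re.split(pattern, sample) if c.strip()]
print(chunks)
# ['First chunk. Two sentences.',
#  'Second chunk starts here\nand continues on a wrapped line.']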

Here's a Python script to create a RAG database from text and PDF documents. This script will:

  1. Process text and PDF files
  2. Create embeddings for the content
  3. Build a FAISS index for efficient retrieval

Here's the script:

import os
import re

import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# readme
# add these libraries:
#
# pip install PyMuPDF faiss-cpu numpy sentence-transformers
#

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_pdf_file(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def split_into_paragraphs(text):
    # Split at newlines that follow sentence-ending punctuation to get
    # paragraph-sized chunks, then drop empty entries.
    pattern = r'(?<=[.!?])\s*\n+'
    paragraphs = re.split(pattern, text)
    return [para.strip() for para in paragraphs if para.strip()]

def process_documents(directory):
    documents = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith('.txt'):
            text = read_text_file(file_path)
        elif filename.endswith('.pdf'):
            text = read_pdf_file(file_path)
        else:
            continue  # skip unsupported file types
        documents.extend(split_into_paragraphs(text))
    return documents

def create_rag_database(directory):
    # Process documents into paragraph chunks
    documents = process_documents(directory)
    # Create embeddings
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = embedder.encode(documents)
    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.astype('float32'))
    return index, documents

# Usage
directory = "docs/"
index, documents = create_rag_database(directory)

# Save the index and documents for later use
faiss.write_index(index, "rag_index.faiss")
np.save("document_chunks.npy", documents)

print(f"RAG database created with {len(documents)} document chunks.")

To use this script:

  1. Install required libraries:
    pip install PyMuPDF faiss-cpu numpy sentence-transformers
  2. Set directory (currently "docs/") to the actual path of the directory containing your text and PDF files.
  3. Run the script. It will create two files:
    • rag_index.faiss: The FAISS index for efficient similarity search
    • document_chunks.npy: A NumPy array containing the paragraph chunks from your documents
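
As a quick sanity check (a sketch, assuming you run it from the directory where the files were written), you can confirm the index and the chunk array line up:

import faiss
import numpy as np

index = faiss.read_index("rag_index.faiss")
chunks = np.load("document_chunks.npy", allow_pickle=True)

# The index should hold exactly one vector per saved chunk
print(index.ntotal, len(chunks))
assert index.ntotal == len(chunks)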

Now you can load these files in your RAG-enhanced workflow:


import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

def rag_annotate(prompt: str, path: str, output: list, k: int = 5) -> tuple[str, dict]:
    # note: 'output' is not used in this function
    k = int(k)
    # Check whether the data is already loaded (cached on the function object)
    if not hasattr(rag_annotate, 'index'):
        # Load the index, chunks, and embedder the first time we're called
        rag_annotate.index = faiss.read_index(f"{path}/rag_index.faiss")
        rag_annotate.document_chunks = np.load(f"{path}/document_chunks.npy", allow_pickle=True)
        rag_annotate.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Embed the prompt and retrieve the k nearest chunks
    prompt_embedding = rag_annotate.embedder.encode([prompt]).astype('float32')
    _, I = rag_annotate.index.search(prompt_embedding, k)
    retrieved_chunks = [rag_annotate.document_chunks[i] for i in I[0]]
    context = "\n".join(retrieved_chunks)
    return (f"Context:\n{context}\n\nQuestion: {prompt}\nAnswer:",
            {"status": {"value": 0, "reason": "Success"}})

This setup allows you to create a RAG database from your documents and then use it in your workflow to enhance prompts before sending them to an LLM. The database creation is done separately, so you only need to run it when you want to update your document set.

