Implement an in-memory Embedbase
⚠️ This is not recommended for production. ⚠️
To get started quickly, you can implement an in-memory Embedbase database. This is not recommended for production, but it's useful for testing and prototyping.
Installation
Install the required dependencies in a virtual environment:
virtualenv env
source env/bin/activate
pip install embedbase numpy
Start Embedbase
Create a new file `main.py` with the following code:
main.py
import uvicorn
import numpy as np
from embedbase import get_app
from embedbase.database.base import VectorDatabase
from embedbase.embedding.openai import OpenAI
class InMemoryDatabase(VectorDatabase):
    """Vector database that keeps every document in a plain Python dict.

    Documents live in ``self.storage`` keyed by document id, so nothing
    outlives the process. Intended for tests and prototypes only.

    NOTE(review): the storage key is the document id alone, so writing the
    same id into two different datasets overwrites the earlier entry.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and start with an empty store."""
        super().__init__(*args, **kwargs)
        # doc_id -> {"data", "embedding", "metadata", "dataset_id", "user_id", "hash"}
        self.storage = {}

    @staticmethod
    def _to_record(doc_id, doc):
        """Build the outward-facing view of one stored document.

        The embedding is converted from ``np.ndarray`` to a plain list, and a
        fresh dict is returned so callers never hold a reference to the
        internal storage entry.
        """
        return {
            "id": doc_id,
            "data": doc["data"],
            "embedding": doc["embedding"].tolist(),
            "metadata": doc["metadata"],
            "hash": doc["hash"],
        }

    @staticmethod
    def _is_visible(doc, dataset_id, user_id):
        """Return True when ``doc`` passes the optional dataset/user filters.

        A filter set to ``None`` matches every document.
        """
        return (dataset_id is None or doc["dataset_id"] == dataset_id) and (
            user_id is None or doc["user_id"] == user_id
        )

    async def update(
        self, df, dataset_id, user_id=None, store_data=True, batch_size=100
    ):
        """Insert or overwrite one stored document per row of ``df``.

        Args:
            df: Table exposing ``iterrows()`` whose rows carry ``id``,
                ``data``, ``embedding``, ``metadata`` and ``hash``
                (presumably a pandas DataFrame -- confirm against caller).
            dataset_id: Dataset the rows are filed under.
            user_id: Optional owner recorded with each row.
            store_data: When false, ``data`` is stored as ``None``.
            batch_size: Not used by this implementation.

        Returns:
            Always ``True``.
        """
        for _, row in df.iterrows():
            self.storage[row.id] = {
                "data": row.data if store_data else None,
                "embedding": np.array(row.embedding),
                "metadata": row.metadata,
                "dataset_id": dataset_id,
                "user_id": user_id,
                "hash": row.hash,
            }
        return True

    async def select(self, ids=None, hashes=None, dataset_id=None, user_id=None):
        """Look documents up by id or, failing that, by content hash.

        ``ids`` takes precedence: ``hashes`` is only consulted when ``ids``
        is empty or ``None``. Both lookups honour the optional
        ``dataset_id`` / ``user_id`` filters and return records of the same
        shape (``id``, ``data``, ``embedding`` as a list, ``metadata``,
        ``hash``).

        Returns:
            A list of matching records; empty when neither ``ids`` nor
            ``hashes`` is given.
        """
        if ids:
            return [
                self._to_record(doc_id, self.storage[doc_id])
                for doc_id in ids
                if doc_id in self.storage
                and self._is_visible(self.storage[doc_id], dataset_id, user_id)
            ]
        if hashes:
            # Build the set once so each membership test is O(1).
            wanted = set(hashes)
            return [
                self._to_record(doc_id, doc)
                for doc_id, doc in self.storage.items()
                if doc["hash"] in wanted
                and self._is_visible(doc, dataset_id, user_id)
            ]
        return []

    async def search(self, vector, top_k, dataset_ids, user_id=None):
        """Rank stored documents by cosine similarity to ``vector``.

        Args:
            vector: Query embedding.
            top_k: Maximum number of results to return.
            dataset_ids: Only documents whose dataset is in this collection
                are considered.
            user_id: Optional owner filter; ``None`` matches every owner.

        Returns:
            Up to ``top_k`` records, best match first, each carrying a
            ``score`` alongside the usual record fields.
        """
        query = np.array(vector)
        # The query norm does not depend on the document; compute it once.
        query_norm = np.linalg.norm(query)

        scored = []
        for doc_id, doc in self.storage.items():
            if doc["dataset_id"] not in dataset_ids:
                continue
            if user_id is not None and doc["user_id"] != user_id:
                continue
            denominator = np.linalg.norm(doc["embedding"]) * query_norm
            # A zero-length vector has no direction; score it 0.0 instead of
            # dividing by zero and letting NaN corrupt the sort order.
            if denominator:
                score = float(np.dot(doc["embedding"], query) / denominator)
            else:
                score = 0.0
            scored.append((doc_id, score))

        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [
            {**self._to_record(doc_id, self.storage[doc_id]), "score": score}
            for doc_id, score in scored[:top_k]
        ]

    async def delete(self, ids, dataset_id, user_id=None):
        """Remove the given ids that pass the dataset/user filters.

        Unknown ids and ids filtered out by ``dataset_id`` / ``user_id`` are
        skipped silently. A ``dataset_id`` of ``None`` matches any dataset.
        """
        for doc_id in ids:
            doc = self.storage.get(doc_id)
            if doc is not None and self._is_visible(doc, dataset_id, user_id):
                del self.storage[doc_id]
        return None

    async def get_datasets(self, user_id=None):
        """Count stored documents per dataset.

        Args:
            user_id: When given, only that user's documents are counted.

        Returns:
            A list of ``{"dataset_id": ..., "documents_count": ...}`` dicts.
        """
        counts = {}
        for doc in self.storage.values():
            if user_id is None or doc["user_id"] == user_id:
                counts[doc["dataset_id"]] = counts.get(doc["dataset_id"], 0) + 1
        return [
            {"dataset_id": dataset_id, "documents_count": count}
            for dataset_id, count in counts.items()
        ]

    async def clear(self, dataset_id, user_id=None):
        """Remove every document of ``dataset_id`` (optionally one user's only).

        Unlike ``delete``, ``dataset_id`` is compared strictly here: ``None``
        only matches documents stored with a ``None`` dataset.
        """
        # Collect first: a dict must not change size while being iterated.
        doomed = [
            doc_id
            for doc_id, doc in self.storage.items()
            if doc["dataset_id"] == dataset_id
            and (user_id is None or doc["user_id"] == user_id)
        ]
        for doc_id in doomed:
            del self.storage[doc_id]
        return None
# Plug the OpenAI embedder and the in-memory store into the Embedbase app.
# The value returned by `.run()` is what uvicorn serves below as `main:app`.
app = (
    get_app()
    .use_embedder(OpenAI("<your-openai-api-key>"))
    .use_db(InMemoryDatabase())
).run()

if __name__ == "__main__":
    # The import string assumes this file is saved as main.py.
    uvicorn.run("main:app")
Start the Embedbase application with the following command:
python3 main.py
Test the endpoint
TypeScript
import { createClient } from 'embedbase-js'

// Client pointed at the locally running Embedbase server.
const embedbase = createClient("http://localhost:8000")

const DATASET_ID = "animals"
const SENTENCES = [
  "The lion is the king of the savannah.",
  "The chimpanzee is a great ape.",
  "The elephant is the largest land animal.",
];

// Add every sentence to the dataset in a single batch and log the response.
async function add() {
  const documents = SENTENCES.map((sentence) => ({ data: sentence }))
  const response = await embedbase.dataset(DATASET_ID).batchAdd(documents)
  console.log(response)
}

add()
You should get a similar response:
{
"results": [
{
"data": "The lion is the king of the savannah.",
"embedding": [...],
"hash": ...,
"metadata": null
},
{
"data": "The chimpanzee is a great ape.",
"embedding": [...],
"hash": ...,
"metadata": null
},
{
"data": "The elephant is the largest land animal.",
"embedding": [...],
"hash": ...,
"metadata": null
}
]
}
Let's try to search now
TypeScript
// Ask the dataset for the single closest match to the query and log it.
// Relies on `embedbase` and `DATASET_ID` from the previous snippet.
async function search() {
  const query = "Animal that lives in the savannah"
  const response = await embedbase.dataset(DATASET_ID).search(query, { limit: 1 })
  console.log(response)
}

search()
You should get a similar response:
"The lion is the king of the savannah."
When you want to move to production, you can use Embedbase Cloud, Supabase, or Postgres as a database.