A RAG application that works perfectly with toy datasets grinds to a halt at production scale. The vector database that benchmarked beautifully with 10K vectors performs terribly at 10M. The one that claimed “blazing fast” similarity search becomes unusable when you actually need metadata filtering. Choosing a vector database based on GitHub stars or marketing claims is a recipe for production disasters.
Rigorous benchmarking against realistic workloads is essential. The tradeoffs between performance, cost, features, and operational complexity are real and vary by use case.
Benchmarking Framework
(An interactive diagram of the benchmarking framework appears here in the original; it requires JavaScript to render and is omitted from this text version.)
Performance Benchmarking
Test Dataset Creation
Realistic workloads, not random vectors:
class VectorBenchmarkDataset:
    """Builds synthetic-but-realistic benchmark data modeled on a legal-document corpus.

    Vectors are derived from real seed embeddings (perturbed with Gaussian noise and
    re-normalized), and the query workload mirrors the observed mix of production
    query types rather than uniformly random lookups.
    """

    def __init__(self, target_size, vector_dim=768):
        self.target_size = target_size
        self.vector_dim = vector_dim
        self.datasets = {}

    def generate_realistic_dataset(self):
        """Generate dataset mimicking legal documents."""
        seed_embeddings = self.load_legal_embeddings()
        dataset = {'vectors': [], 'metadata': [], 'payloads': []}
        n_seeds = len(seed_embeddings)

        for idx in range(self.target_size):
            # Cycle through the seed embeddings once they are exhausted;
            # for idx < n_seeds this is the seed itself.
            base = seed_embeddings[idx % n_seeds]

            # Perturb with Gaussian noise, then re-normalize so every
            # synthetic vector stays on the unit sphere.
            perturbed = base + np.random.normal(0, 0.1, self.vector_dim)
            dataset['vectors'].append(perturbed / np.linalg.norm(perturbed))

            # Metadata with realistic cardinalities (cases, clients, matters).
            dataset['metadata'].append({
                'document_id': f'DOC_{idx:08d}',
                'case_id': f'CASE_{idx % 10000:05d}',
                'date': self.generate_realistic_date(),
                'document_type': self.random_document_type(),
                'jurisdiction': self.random_jurisdiction(),
                'practice_area': self.random_practice_area(),
                'client_id': f'CLIENT_{idx % 1000:04d}',
                'matter_id': f'MATTER_{idx % 5000:05d}'
            })

            # Payloads with log-normal size distributions (long-tailed, like real files).
            dataset['payloads'].append({
                'title': self.generate_document_title(),
                'excerpt': self.generate_document_excerpt(),
                'tags': self.generate_tags(),
                'file_size': np.random.lognormal(10, 2),
                'page_count': max(1, int(np.random.lognormal(2, 1)))
            })

        return dataset

    def generate_query_workload(self, dataset, num_queries=10000):
        """Generate realistic query patterns."""
        workload = []
        corpus = dataset['vectors']

        # 60% single document retrieval (known item search): small noise, top_k=1.
        for _ in range(int(num_queries * 0.6)):
            anchor = corpus[np.random.randint(len(corpus))]
            workload.append({
                'type': 'single_document',
                'vector': anchor + np.random.normal(0, 0.05, self.vector_dim),
                'top_k': 1,
                'filters': None
            })

        # 30% similar document search: larger noise, filtered to one case.
        for _ in range(int(num_queries * 0.3)):
            anchor = corpus[np.random.randint(len(corpus))]
            workload.append({
                'type': 'similar_documents',
                'vector': anchor + np.random.normal(0, 0.1, self.vector_dim),
                'top_k': 20,
                'filters': {
                    'case_id': f'CASE_{np.random.randint(10000):05d}'
                }
            })

        # 10% complex discovery queries: concept vectors with compound filters.
        for _ in range(int(num_queries * 0.1)):
            workload.append({
                'type': 'discovery',
                'vector': self.generate_concept_vector(),
                'top_k': 100,
                'filters': {
                    'date': {'$gte': '2020-01-01', '$lte': '2023-12-31'},
                    'practice_area': {'$in': self.random_practice_areas(3)},
                    'jurisdiction': self.random_jurisdiction()
                }
            })

        return workload
Latency Testing
class LatencyBenchmark:
    """Measures per-query latency (in milliseconds) against a vector DB client,
    either serially or with a pool of concurrent workers."""

    def __init__(self, vector_db_client):
        self.client = vector_db_client
        self.results = defaultdict(list)

    def benchmark_latency(self, queries, concurrent_users=1):
        """Measure query latency under various conditions."""
        stats = {
            'p50': [],
            'p95': [],
            'p99': [],
            'p999': [],
            'mean': [],
            'errors': 0
        }

        if concurrent_users == 1:
            # Serial path: time each search call individually.
            samples = []
            for query in tqdm(queries):
                t0 = time.perf_counter()
                try:
                    self.client.search(
                        vector=query['vector'],
                        top_k=query['top_k'],
                        filter=query.get('filters')
                    )
                except Exception:
                    stats['errors'] += 1
                else:
                    # Successful queries only; convert seconds -> milliseconds.
                    samples.append((time.perf_counter() - t0) * 1000)
            stats.update(self.calculate_percentiles(samples))
        else:
            # Concurrent path: fan queries out across a thread pool and
            # collect per-query latencies as they complete.
            with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent_users) as executor:
                pending = [executor.submit(self.execute_query, q) for q in queries]
                samples = []
                for done in concurrent.futures.as_completed(pending):
                    try:
                        elapsed = done.result()
                    except Exception:
                        stats['errors'] += 1
                    else:
                        if elapsed is not None:
                            samples.append(elapsed)
            stats.update(self.calculate_percentiles(samples))

        return stats
Throughput Testing
class ThroughputBenchmark:
    """Measures sustained queries-per-second and finds the concurrency level
    where throughput stops scaling."""

    def __init__(self, vector_db_client):
        self.client = vector_db_client

    def benchmark_throughput(self, queries, duration_seconds=300):
        """Measure sustainable queries per second.

        Args:
            queries: list of query dicts with 'vector', 'top_k', optional 'filters'.
            duration_seconds: how long to run the timed loop.

        Returns:
            dict with qps, total_queries, errors, error_rate, and mean/p95
            latency in milliseconds.
        """
        completed_queries = 0
        errors = 0

        # Warm up caches/connections before timing anything.
        for _ in range(100):
            self.client.search(
                vector=queries[0]['vector'],
                top_k=10
            )

        # BUGFIX: start the clock AFTER warm-up; previously warm-up time was
        # counted against the benchmark window and deflated the reported QPS.
        query_times = []
        start_time = time.time()
        while time.time() - start_time < duration_seconds:
            query = queries[completed_queries % len(queries)]
            query_start = time.perf_counter()
            try:
                self.client.search(
                    vector=query['vector'],
                    top_k=query['top_k'],
                    filter=query.get('filters')
                )
                query_times.append(time.perf_counter() - query_start)
                completed_queries += 1
            except Exception:
                errors += 1

        elapsed = max(time.time() - start_time, 1e-9)  # avoid div-by-zero on tiny durations
        attempts = completed_queries + errors
        return {
            'qps': completed_queries / elapsed,
            'total_queries': completed_queries,
            'errors': errors,
            # Guard: attempts can be zero when duration_seconds <= 0.
            'error_rate': errors / attempts if attempts else 0.0,
            'mean_latency': float(np.mean(query_times)) * 1000 if query_times else 0.0,
            'p95_latency': float(np.percentile(query_times, 95)) * 1000 if query_times else 0.0
        }

    def find_saturation_point(self, queries, max_concurrent=200):
        """Find the point where throughput stops scaling.

        Ramps up worker counts and stops when adding workers yields less than
        a 10% throughput gain over the previous level.
        """
        results = []
        # BUGFIX: the loop variable was named `concurrent`, shadowing the stdlib
        # `concurrent` package and making `concurrent.futures.ThreadPoolExecutor`
        # raise AttributeError on an int. Renamed to `n_workers`.
        for n_workers in [1, 5, 10, 20, 50, 100, 150, 200]:
            if n_workers > max_concurrent:
                break
            print(f"Testing with {n_workers} concurrent connections...")
            with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
                start_time = time.time()
                # Each worker runs continuous queries for a 30-second window.
                futures = [
                    executor.submit(self.run_continuous_queries, queries, 30)
                    for _ in range(n_workers)
                ]
                total_completed = 0
                total_errors = 0
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    total_completed += result['completed']
                    total_errors += result['errors']
                elapsed = time.time() - start_time
            qps = total_completed / elapsed
            results.append({
                'concurrent_connections': n_workers,
                'qps': qps,
                'total_queries': total_completed,
                'errors': total_errors,
                # Per-worker efficiency: how much each connection contributes.
                'efficiency': qps / n_workers
            })
            # Saturation: <10% gain over the previous concurrency level.
            if len(results) > 1 and results[-1]['qps'] < results[-2]['qps'] * 1.1:
                print(f"Saturation detected at {n_workers} connections")
                break
        return results
Cost Analysis
class VectorDBCostCalculator:
    """Estimates monthly total cost of ownership for several vector databases,
    combining storage, compute, query, and operational overhead."""

    def __init__(self):
        # Pricing snapshots per vendor; update these as list prices change.
        self.pricing_models = {
            'pinecone': {
                'type': 'managed',
                'pod_types': {
                    'p1.x1': {'vcpu': 1, 'memory': 8, 'price_hour': 0.096},
                    'p1.x2': {'vcpu': 2, 'memory': 16, 'price_hour': 0.192},
                    'p1.x4': {'vcpu': 4, 'memory': 32, 'price_hour': 0.384},
                    'p2.x1': {'vcpu': 1, 'memory': 10, 'price_hour': 0.140},
                    'p2.x4': {'vcpu': 4, 'memory': 40, 'price_hour': 0.560},
                },
                'storage_gb_month': 0.025,
                'queries_per_month_free': 100000,
                'price_per_million_queries': 8.25
            },
            'weaviate': {
                'type': 'self_hosted',
                'instance_recommendations': {
                    'small': 'c5.2xlarge',
                    'medium': 'c5.4xlarge',
                    'large': 'c5.9xlarge',
                    'xlarge': 'c5.18xlarge'
                }
            },
            'qdrant': {
                'type': 'hybrid',
                'cloud_pricing': {
                    'free_tier_vectors': 1000000,
                    'price_per_million_vectors_month': 95
                }
            },
            'pgvector': {
                'type': 'self_hosted',
                'instance_recommendations': {
                    'small': 'db.r6g.xlarge',
                    'medium': 'db.r6g.2xlarge',
                    'large': 'db.r6g.4xlarge',
                    'xlarge': 'db.r6g.8xlarge'
                }
            }
        }

    def calculate_total_cost(self, db_type, vectors_count, queries_per_month,
                             vector_dim=768, metadata_size_bytes=1000):
        """Calculate total cost of ownership.

        Args:
            db_type: one of 'pinecone', 'weaviate', 'qdrant', 'pgvector'.
            vectors_count: number of stored vectors.
            queries_per_month: expected monthly query volume.
            vector_dim: embedding dimensionality (float32 assumed).
            metadata_size_bytes: average metadata payload per vector.

        Returns:
            dict of monthly cost components plus 'total_monthly'.
        """
        # float32 vectors: 4 bytes per dimension.
        vector_size_gb = (vectors_count * vector_dim * 4) / (1024**3)
        metadata_size_gb = (vectors_count * metadata_size_bytes) / (1024**3)
        # Index overhead factor of 1.5x — assumed for ANN indexes; TODO confirm per vendor.
        index_overhead = 1.5
        total_storage_gb = (vector_size_gb + metadata_size_gb) * index_overhead

        costs = {
            'storage': 0,
            'compute': 0,
            'queries': 0,
            'operations': 0,
            'total_monthly': 0
        }

        if db_type == 'pinecone':
            # Pods sized by vector capacity; pod tier chosen by query volume.
            vectors_per_pod = 1000000
            num_pods = math.ceil(vectors_count / vectors_per_pod)
            if queries_per_month < 1000000:
                pod_type = 'p1.x1'
            elif queries_per_month < 10000000:
                pod_type = 'p1.x2'
            else:
                pod_type = 'p1.x4'
            pod_price = self.pricing_models['pinecone']['pod_types'][pod_type]['price_hour']
            costs['compute'] = num_pods * pod_price * 24 * 30
            costs['storage'] = total_storage_gb * self.pricing_models['pinecone']['storage_gb_month']
            # Only queries beyond the free tier are billed.
            billable_queries = max(0, queries_per_month - self.pricing_models['pinecone']['queries_per_month_free'])
            costs['queries'] = (billable_queries / 1000000) * self.pricing_models['pinecone']['price_per_million_queries']
        elif db_type == 'weaviate':
            # Instance size scales with vector count.
            if vectors_count < 10000000:
                instance_type = 'small'
            elif vectors_count < 50000000:
                instance_type = 'medium'
            elif vectors_count < 200000000:
                instance_type = 'large'
            else:
                instance_type = 'xlarge'
            instance_costs = {
                'small': 0.17 * 24 * 30,
                'medium': 0.34 * 24 * 30,
                'large': 0.765 * 24 * 30,
                'xlarge': 1.53 * 24 * 30
            }
            costs['compute'] = instance_costs[instance_type]
            costs['storage'] = total_storage_gb * 0.10
            # Self-hosted: 20% overhead for ops/monitoring/maintenance.
            costs['operations'] = (costs['compute'] + costs['storage']) * 0.20
        elif db_type == 'qdrant':
            # BUGFIX: 'qdrant' was defined in pricing_models but had no branch
            # here, so the method silently returned all-zero costs. Qdrant Cloud
            # bills per million vectors per month beyond the free tier.
            pricing = self.pricing_models['qdrant']['cloud_pricing']
            billable_vectors = max(0, vectors_count - pricing['free_tier_vectors'])
            costs['compute'] = (billable_vectors / 1000000) * pricing['price_per_million_vectors_month']
        elif db_type == 'pgvector':
            if vectors_count < 5000000:
                instance_type = 'small'
            elif vectors_count < 25000000:
                instance_type = 'medium'
            elif vectors_count < 100000000:
                instance_type = 'large'
            else:
                instance_type = 'xlarge'
            instance_costs = {
                'small': 0.252 * 24 * 30,
                'medium': 0.504 * 24 * 30,
                'large': 1.008 * 24 * 30,
                'xlarge': 2.016 * 24 * 30
            }
            costs['compute'] = instance_costs[instance_type]
            costs['storage'] = total_storage_gb * 0.115
            costs['operations'] = costs['storage']

        # Sum only the component costs — never the running total itself.
        costs['total_monthly'] = sum(v for k, v in costs.items() if k != 'total_monthly')
        return costs
Feature Comparison
class FeatureEvaluator:
    """Static feature-comparison matrix for candidate vector databases.

    Each entry maps a database name to its capabilities: supported similarity
    metrics, search features, security/ops features, SDK coverage, and limits.
    Values are point-in-time snapshots — verify against current vendor docs
    before relying on them for a decision.
    """

    def __init__(self):
        # One sub-dict per database; keys are kept identical across entries so
        # the matrix can be compared column-by-column.
        self.feature_matrix = {
            'pinecone': {
                'vector_similarity': ['cosine', 'euclidean', 'dot_product'],
                'metadata_filtering': True,
                'hybrid_search': False,
                'full_text_search': False,
                'multi_vector': False,
                'transactions': False,
                'backups': True,
                'multi_tenancy': True,
                'access_control': True,
                'encryption_at_rest': True,
                'encryption_in_transit': True,
                'sparse_vectors': True,
                'streaming_updates': True,
                'bulk_operations': True,
                'sdk_languages': ['python', 'node', 'go', 'java'],
                'max_vector_dim': 20000,
                'max_metadata_size': '40kb',
                'consistency_model': 'eventual',
                'deployment_options': ['cloud']
            },
            'weaviate': {
                'vector_similarity': ['cosine', 'euclidean', 'dot_product', 'hamming'],
                'metadata_filtering': True,
                'hybrid_search': True,
                'full_text_search': True,
                'multi_vector': True,
                'transactions': False,
                'backups': True,
                'multi_tenancy': True,
                'access_control': True,
                'encryption_at_rest': True,
                'encryption_in_transit': True,
                'sparse_vectors': False,
                'streaming_updates': True,
                'bulk_operations': True,
                'sdk_languages': ['python', 'node', 'go', 'java', 'typescript'],
                'max_vector_dim': 65535,
                'max_metadata_size': '10mb',
                'consistency_model': 'eventual',
                'deployment_options': ['cloud', 'self-hosted', 'kubernetes']
            },
            'qdrant': {
                'vector_similarity': ['cosine', 'euclidean', 'dot_product'],
                'metadata_filtering': True,
                'hybrid_search': False,
                'full_text_search': True,
                'multi_vector': True,
                'transactions': False,
                'backups': True,
                'multi_tenancy': False,
                'access_control': True,
                'encryption_at_rest': True,
                'encryption_in_transit': True,
                'sparse_vectors': True,
                'streaming_updates': True,
                'bulk_operations': True,
                'sdk_languages': ['python', 'rust', 'go', 'typescript'],
                'max_vector_dim': 65536,
                'max_metadata_size': '1mb',
                'consistency_model': 'eventual',
                'deployment_options': ['cloud', 'self-hosted', 'docker']
            },
            'pgvector': {
                # NOTE(review): pgvector names its metric 'inner_product' where
                # others use 'dot_product' — normalize before comparing entries.
                'vector_similarity': ['cosine', 'euclidean', 'inner_product'],
                'metadata_filtering': True,
                'hybrid_search': True,
                'full_text_search': True,
                'multi_vector': True,
                'transactions': True,
                'backups': True,
                'multi_tenancy': True,
                'access_control': True,
                'encryption_at_rest': True,
                'encryption_in_transit': True,
                'sparse_vectors': False,
                'streaming_updates': True,
                'bulk_operations': True,
                'sdk_languages': ['any'],
                'max_vector_dim': 16000,
                'max_metadata_size': '1gb',
                'consistency_model': 'strong',
                'deployment_options': ['self-hosted', 'cloud', 'managed']
            }
        }
Benchmarking Best Practices
Realistic Workloads
- Use production data distributions
- Model actual query patterns
- Include metadata filtering
- Test at production scale
Comprehensive Metrics
- Measure percentiles, not just averages
- Track resource usage alongside performance
- Monitor performance degradation over time
- Include failure scenarios
Fair Comparisons
- Use identical hardware where possible
- Warm up systems before testing
- Run multiple iterations
- Document all configurations
Consider Total Cost
- Include operational overhead
- Factor in expertise requirements
- Account for growth projections
- Consider migration costs
Decision Rules
Choose based on your constraints:
Pinecone: When you want fully managed, minimal ops burden, and can pay premium pricing.
Qdrant: When you need best price/performance ratio and can manage your own infrastructure.
Weaviate: When you need hybrid search (vector + keyword) and rich features.
pgvector: When you already use PostgreSQL and need strong consistency with transactional support.
Multi-database approach: Different use cases may warrant different databases—a common pattern is Qdrant for primary search and pgvector for data requiring strong consistency.
Key principles:
- Test with realistic workloads, not toy datasets
- Consider TCO, not just licensing costs
- Plan for growth—today’s solution may not scale
- Different databases suit different use cases