Here is a very simple Python script to benchmark the performance of Ollama with a given model, in this case mistral.
import time

import requests


def benchmark_ollama(server, model, prompt, num_runs=5):
    """Time num_runs non-streaming /api/generate calls and report averages."""
    times = []
    tokens_per_sec = []
    for i in range(num_runs):
        start = time.time()
        response = requests.post(server,
                                 json={
                                     'model': model,
                                     'prompt': prompt,
                                     'stream': False
                                 })
        end = time.time()
        response.raise_for_status()  # fail early on HTTP errors
        data = response.json()
        times.append(end - start)
        if 'eval_duration' in data:
            # eval_duration is in nanoseconds
            tokens = data.get('eval_count', 0)
            duration_sec = data['eval_duration'] / 1e9
            tokens_per_sec.append(tokens / duration_sec if duration_sec > 0 else 0)
    print(f"Server: {server}")
    print(f"Model: {model}")
    print(f"Average time: {sum(times)/len(times):.2f}s")
    if tokens_per_sec:  # avoid division by zero if the field was never returned
        print(f"Average tokens/sec: {sum(tokens_per_sec)/len(tokens_per_sec):.2f}")
    print(f"Min: {min(times):.2f}s, Max: {max(times):.2f}s")

# Usage
benchmark_ollama('http://localhost:11434/api/generate', 'mistral',
                 'Explain quantum physics to me', num_runs=3)
# benchmark_ollama('http://X.X.X.X:11434/api/generate', 'mistral',
#                  'Explain quantum physics to me', num_runs=3)
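Beyond wall-clock time, the non-streaming response from /api/generate also exposes a more detailed timing breakdown: total_duration, load_duration, prompt_eval_count, prompt_eval_duration, eval_count, and eval_duration (all durations in nanoseconds). Here is a minimal sketch, assuming the same local server and model as above, that separates model loading and prompt processing from generation; the fields are read with .get() since Ollama may omit some of them (for example when the prompt is cached).

import requests

# Sketch: break down the timing fields returned by one non-streaming
# /api/generate call. All *_duration fields are in nanoseconds.
resp = requests.post('http://localhost:11434/api/generate',
                     json={'model': 'mistral',
                           'prompt': 'Explain quantum physics to me',
                           'stream': False})
resp.raise_for_status()
data = resp.json()

NS = 1e9  # nanoseconds per second
print(f"Total:       {data.get('total_duration', 0) / NS:.2f}s")
print(f"Model load:  {data.get('load_duration', 0) / NS:.2f}s")
print(f"Prompt eval: {data.get('prompt_eval_count', 0)} tokens "
      f"in {data.get('prompt_eval_duration', 0) / NS:.2f}s")
print(f"Generation:  {data.get('eval_count', 0)} tokens "
      f"in {data.get('eval_duration', 0) / NS:.2f}s")

Note that the first request after the server starts includes the model load time (visible in load_duration), so it can skew the averages; for comparable numbers you may want to send a warm-up request first or discard the first run.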