Reward rubrics leverage LLMs to evaluate outputs using natural language criteria. They use the @osmosis_rubric decorator and delegate scoring to a language model based on a rubric description.
Basic Example
File: reward_rubric/reward_rubric_openai.py
from osmosis_ai import evaluate_rubric, osmosis_rubric
import os

RUBRIC = "Reward based on whether the predicted numerical value matches the ground truth."
SCORE_MIN = 0.0
SCORE_MAX = 1.0
PROVIDER = "openai"
MODEL = "gpt-5"
API_KEY = os.getenv("OPENAI_API_KEY")

@osmosis_rubric
def compute_rubric_score_openai(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    """Delegate rubric scoring to an OpenAI GPT model."""
    model_info = {
        "provider": PROVIDER,
        "model": MODEL,
        "api_key": API_KEY,
    }
    result = evaluate_rubric(
        rubric=RUBRIC,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        metadata=extra_info.get("metadata"),
        score_min=SCORE_MIN,
        score_max=SCORE_MAX,
        return_details=False,
    )
    return float(result)
Function Signature
@osmosis_rubric
def my_rubric_function(
    solution_str: str,   # The LLM output to evaluate
    ground_truth: str,   # The expected correct answer
    extra_info: dict,    # Additional context/metadata (required, no default)
    **kwargs,            # Future compatibility (REQUIRED)
) -> float:              # Return the score produced by the rubric
    pass
The evaluate_rubric Function
The evaluate_rubric helper function handles the LLM evaluation:
from osmosis_ai import evaluate_rubric

score = evaluate_rubric(
    rubric="Your evaluation criteria here",
    solution_str="The output to evaluate",
    model_info={
        "provider": "openai",
        "model": "gpt-5",
        "api_key": "your-api-key",
    },
    ground_truth="Expected answer",
    metadata={"additional": "context"},
    score_min=0.0,
    score_max=1.0,
    return_details=False,
)
Parameters
| Parameter | Type | Description |
| --- | --- | --- |
| rubric | str | Natural language description of evaluation criteria |
| solution_str | str | The LLM output to evaluate |
| model_info | dict | Provider, model, and API key configuration |
| ground_truth | str | Expected correct answer or reference |
| metadata | dict | Optional additional context |
| score_min | float | Minimum score value (default: 0.0) |
| score_max | float | Maximum score value (default: 1.0) |
| return_details | bool | Whether to return a detailed explanation |
Supported Providers
OpenAI
from osmosis_ai import evaluate_rubric, osmosis_rubric
import os

@osmosis_rubric
def openai_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    model_info = {
        "provider": "openai",
        "model": "gpt-5",
        "api_key": os.getenv("OPENAI_API_KEY"),
    }
    return evaluate_rubric(
        rubric="Evaluate the solution quality...",
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        score_min=0.0,
        score_max=1.0,
    )
Anthropic
@osmosis_rubric
def anthropic_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    model_info = {
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
        "api_key": os.getenv("ANTHROPIC_API_KEY"),
    }
    return evaluate_rubric(
        rubric="Evaluate based on accuracy and clarity...",
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        score_min=0.0,
        score_max=1.0,
    )
For additional providers (Google Gemini, xAI Grok, OpenRouter, Cerebras), see the API Reference.
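As a rough guide, switching providers usually only changes the model_info block. The provider string, model name, and environment variable in the sketch below are illustrative assumptions, not confirmed values; check the API Reference for the exact identifiers.

# Sketch only: "xai", the model name, and the env var are assumed, not confirmed.
model_info = {
    "provider": "xai",
    "model": "grok-4",
    "api_key": os.getenv("XAI_API_KEY"),
}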
Writing Effective Rubrics
Be Specific
# Vague
rubric = "Score the answer quality"
# Specific
rubric = """
Evaluate the solution based on:
1. Correctness: Does it match the ground truth? (50%)
2. Explanation: Is the reasoning clear? (30%)
3. Formatting: Is it well-structured? (20%)
Return a score from 0.0 to 1.0.
"""
Include Scoring Guidelines
rubric = """
Score the code quality from 0.0 to 1.0 based on:
- 1.0: Perfect - Correct, efficient, well-documented
- 0.7-0.9: Good - Correct with minor style issues
- 0.4-0.6: Fair - Works but has problems
- 0.0-0.3: Poor - Incorrect or seriously flawed
Ground truth: {ground_truth}
"""
Provide Examples
rubric = """
Evaluate if the SQL query correctly answers the question.
Examples:
- "SELECT * FROM users WHERE age > 18" for "users over 18" → 1.0
- "SELECT name FROM users WHERE age >= 18" for "users over 18" → 0.8 (missing users exactly 18)
- "SELECT * FROM products" for "users over 18" → 0.0 (wrong table)
Score from 0.0 (completely wrong) to 1.0 (perfect).
"""
Advanced Patterns
Multi-Aspect Evaluation
@osmosis_rubric
def comprehensive_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    rubric = """
    Evaluate the solution across multiple dimensions:
    1. Factual Accuracy (40%): Is the information correct?
    2. Completeness (30%): Does it address all parts of the question?
    3. Clarity (20%): Is it easy to understand?
    4. Conciseness (10%): Is it appropriately brief?
    Compare against ground truth: {ground_truth}
    Return a weighted average score from 0.0 to 1.0.
    """
    model_info = {
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
        "api_key": os.getenv("ANTHROPIC_API_KEY"),
    }
    return evaluate_rubric(
        rubric=rubric,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        metadata=extra_info.get("metadata"),
        score_min=0.0,
        score_max=1.0,
    )
Context-Aware Rubric
@osmosis_rubric
def context_aware_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    # Extract context from extra_info
    difficulty = extra_info.get("metadata", {}).get("difficulty", "medium")
    # get_efficiency_requirement is a user-defined helper (see the sketch after this example)
    rubric = f"""
    Evaluate the solution for a {difficulty} difficulty problem.
    Criteria:
    - Correctness: Must match ground truth logic
    - Approach: Should be appropriate for {difficulty} level
    - Efficiency: Expected to be {get_efficiency_requirement(difficulty)}
    Ground truth: {{ground_truth}}
    Score from 0.0 to 1.0.
    """
    model_info = {
        "provider": "openai",
        "model": "gpt-5",
        "api_key": os.getenv("OPENAI_API_KEY"),
    }
    return evaluate_rubric(
        rubric=rubric,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        metadata=extra_info.get("metadata"),
        score_min=0.0,
        score_max=1.0,
    )
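The example above calls get_efficiency_requirement, a helper you define yourself; it is not part of the SDK. A minimal sketch, with an illustrative mapping:

def get_efficiency_requirement(difficulty: str) -> str:
    # Hypothetical helper: map problem difficulty to an efficiency expectation.
    return {
        "easy": "any working approach",
        "medium": "a reasonably efficient approach",
        "hard": "an optimal or near-optimal approach",
    }.get(difficulty, "a reasonably efficient approach")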
Getting Detailed Feedback
@osmosis_rubric
def detailed_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    rubric = "Evaluate the solution quality and provide detailed feedback."
    model_info = {
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
        "api_key": os.getenv("ANTHROPIC_API_KEY"),
    }
    # Get detailed result
    result = evaluate_rubric(
        rubric=rubric,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        score_min=0.0,
        score_max=1.0,
        return_details=True,  # Returns dict with score and explanation
    )
    # Log the explanation for debugging
    if isinstance(result, dict):
        print(f"Score: {result['score']}")
        print(f"Reasoning: {result['explanation']}")
        return float(result["score"])
    return float(result)
Error Handling
Always handle errors gracefully:
@osmosis_rubric
def robust_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs,
) -> float:
    try:
        model_info = {
            "provider": "openai",
            "model": "gpt-5",
            "api_key": os.getenv("OPENAI_API_KEY"),
        }
        if not model_info["api_key"]:
            print("Warning: API key not found")
            return 0.0
        result = evaluate_rubric(
            rubric="Evaluate solution quality...",
            solution_str=solution_str,
            model_info=model_info,
            ground_truth=ground_truth,
            score_min=0.0,
            score_max=1.0,
        )
        return float(result)
    except Exception as e:
        print(f"Error in rubric evaluation: {e}")
        return 0.0
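Provider calls can also fail transiently (rate limits, timeouts), so a small retry wrapper is often worth adding. The sketch below is illustrative: evaluate_with_retries, the attempt count, and the backoff policy are choices made for this example, not SDK behavior.

import time

def evaluate_with_retries(max_attempts: int = 3, **call_kwargs) -> float:
    # Retry the LLM evaluation a few times with exponential backoff before giving up.
    for attempt in range(1, max_attempts + 1):
        try:
            return float(evaluate_rubric(**call_kwargs))
        except Exception as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt == max_attempts:
                return 0.0
            time.sleep(2 ** attempt)
    return 0.0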
Best Practices
1. Keep API Keys Secure
import os

# Good - use environment variables
API_KEY = os.getenv("OPENAI_API_KEY")

# Bad - never hardcode keys
API_KEY = "sk-..."  # Don't do this!
2. Choose Appropriate Models
# Example with OpenAI
model = "gpt-5"
# Example with Anthropic
model = "claude-sonnet-4-5"
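One way to keep model choices in a single place is a provider-to-model map. This is a sketch: build_model_info, DEFAULT_MODELS, and the environment-variable naming convention are illustrative, not part of the SDK.

import os

DEFAULT_MODELS = {
    "openai": "gpt-5",
    "anthropic": "claude-sonnet-4-5",
}

def build_model_info(provider: str) -> dict:
    # Assumes API keys follow the <PROVIDER>_API_KEY naming convention.
    return {
        "provider": provider,
        "model": DEFAULT_MODELS[provider],
        "api_key": os.getenv(f"{provider.upper()}_API_KEY"),
    }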
3. Cache API Calls When Possible
from functools import lru_cache
@lru_cache(maxsize=1000)
def cached_evaluate(solution_str, ground_truth):
    return evaluate_rubric(...)

@osmosis_rubric
def cached_rubric(solution_str: str, ground_truth: str, extra_info: dict, **kwargs) -> float:
    return cached_evaluate(solution_str, ground_truth)
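A fuller version of the same idea might look like the sketch below. RUBRIC and MODEL_INFO are module-level constants you define yourself, and the cache only pays off when identical (solution_str, ground_truth) pairs recur.

@lru_cache(maxsize=1000)
def _cached_evaluate(solution_str: str, ground_truth: str) -> float:
    # Both arguments are hashable strings, so lru_cache can memoize the API call.
    return float(evaluate_rubric(
        rubric=RUBRIC,
        solution_str=solution_str,
        model_info=MODEL_INFO,
        ground_truth=ground_truth,
        score_min=0.0,
        score_max=1.0,
    ))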
4. Set Appropriate Score Ranges
# For binary outcomes
score_min = 0.0
score_max = 1.0
# For graded responses
score_min = 0.0
score_max = 10.0
# For percentage grades
score_min = 0.0
score_max = 100.0
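score_min and score_max tell evaluate_rubric the range the returned score should fall in. As a sketch, a 0-10 grading scale can be normalized back to 0-1 afterwards; the normalization step is an assumption about your training loop, not an SDK requirement.

raw = evaluate_rubric(
    rubric="Grade the answer from 0 (unusable) to 10 (excellent).",
    solution_str=solution_str,
    model_info=model_info,
    ground_truth=ground_truth,
    score_min=0.0,
    score_max=10.0,
)
normalized = float(raw) / 10.0  # only needed if your training loop expects [0, 1]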
Testing Locally
Test your rubrics before deployment:
# test_rubrics.py
from reward_rubric.reward_rubric_openai import compute_rubric_score_openai

def test_correct_answer():
    score = compute_rubric_score_openai(
        solution_str="The answer is 42",
        ground_truth="42",
        extra_info={},
    )
    print(f"Correct answer score: {score}")
    assert 0.8 <= score <= 1.0

def test_incorrect_answer():
    score = compute_rubric_score_openai(
        solution_str="The answer is 100",
        ground_truth="42",
        extra_info={},
    )
    print(f"Incorrect answer score: {score}")
    assert 0.0 <= score <= 0.3

if __name__ == "__main__":
    test_correct_answer()
    test_incorrect_answer()
    print("Tests passed!")
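These tests call the real provider API. For an offline test, one option (a sketch, assuming the file layout shown above) is to patch evaluate_rubric where the reward module imports it, so no API key is needed.

# test_rubrics_offline.py
from unittest.mock import patch

from reward_rubric.reward_rubric_openai import compute_rubric_score_openai

def test_without_api_call():
    # Patch evaluate_rubric in the module that imports it so no real LLM call is made.
    with patch("reward_rubric.reward_rubric_openai.evaluate_rubric", return_value=0.9):
        score = compute_rubric_score_openai(
            solution_str="The answer is 42",
            ground_truth="42",
            extra_info={},
        )
    assert score == 0.9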
Next Steps
- Setup Guide: Complete setup walkthrough
- Best Practices: Tips and troubleshooting
- Python SDK: Full API reference
- Example Repo: See complete examples