Custom Rules
Define precise extraction rules and custom scraping logic
About Custom Rules
Custom Rules let you define precise extraction logic using CSS selectors, XPath expressions, regular expressions, and custom JavaScript. They are ideal when you need full control over the scraping process or are working with complex page structures.
Basic Extraction Rules
CSS Selectors
css_selectors.py
from scrapehub import ScrapeHubClient
client = ScrapeHubClient(api_key="your_api_key")
result = client.scrape(
    url="https://example.com",
    rules={
        "title": {
            "selector": "h1.product-title",
            "type": "text"
        },
        "price": {
            "selector": ".price-value",
            "type": "text",
            "transform": "number"  # Convert to a number
        },
        "image": {
            "selector": "img.product-image",
            "attribute": "src"  # Extract an attribute instead of text
        }
    }
)
print(result.data)
# {
#     "title": "Product Name",
#     "price": 99.99,
#     "image": "https://example.com/image.jpg"
# }
XPath Expressions
xpath_rules.py
result = client.scrape(
    url="https://example.com",
    rules={
        "title": {
            "xpath": "//h1[@class='product-title']/text()",
            "type": "text"
        },
        "description": {
            "xpath": "//div[@class='description']//p/text()",
            "type": "text",
            "join": " "  # Join multiple matches with a space
        },
        "specs": {
            "xpath": "//table[@class='specs']//tr",
            "type": "list",
            "item": {
                "name": "./td[1]/text()",
                "value": "./td[2]/text()"
            }
        }
    }
)
print(result.data)
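With these rules, result.data takes roughly the following shape (the values are illustrative, not from a real page):
# {
#     "title": "Product Name",
#     "description": "First paragraph Second paragraph",
#     "specs": [
#         {"name": "Weight", "value": "2 kg"},
#         {"name": "Color", "value": "Black"}
#     ]
# }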
Node.js Example
custom_rules.js
const { ScrapeHubClient } = require('@scrapehub/node');
const client = new ScrapeHubClient({
  apiKey: process.env.SCRAPEHUB_API_KEY
});

async function scrapeWithRules() {
  const result = await client.scrape({
    url: 'https://example.com',
    rules: {
      title: {
        selector: 'h1.product-title',
        type: 'text'
      },
      price: {
        selector: '.price-value',
        type: 'text',
        transform: 'number'
      }
    }
  });
  console.log(result.data);
}

scrapeWithRules();
List Extraction
Extract Multiple Items
list_extraction.py
result = client.scrape(
    url="https://example.com/products",
    rules={
        "products": {
            "selector": ".product-card",
            "type": "list",
            "item": {
                "name": {
                    "selector": ".product-name",
                    "type": "text"
                },
                "price": {
                    "selector": ".price",
                    "type": "text",
                    "transform": "number"
                },
                "url": {
                    "selector": "a.product-link",
                    "attribute": "href"
                },
                "rating": {
                    "selector": ".rating",
                    "attribute": "data-rating",
                    "transform": "number"
                }
            }
        }
    }
)

# Access the list
for product in result.data['products']:
    print(f"{product['name']}: ${product['price']}")
Nested Lists
nested_lists.py
result = client.scrape(
    url="https://example.com/categories",
    rules={
        "categories": {
            "selector": ".category",
            "type": "list",
            "item": {
                "name": {
                    "selector": ".category-name",
                    "type": "text"
                },
                "products": {
                    "selector": ".product",
                    "type": "list",
                    "item": {
                        "name": ".product-name",
                        "price": ".product-price"
                    }
                }
            }
        }
    }
)
print(result.data)
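The result mirrors the rule nesting, so it can be walked like any nested list of dicts. A short sketch, assuming the shorthand string rules above extract text under the same keys:
for category in result.data["categories"]:
    for product in category["products"]:
        print(f"{category['name']} / {product['name']}: {product['price']}")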
Data Transformations
Built-in Transformers
transformers.py
result = client.scrape(
    url="https://example.com",
    rules={
        "price": {
            "selector": ".price",
            "transform": "number"  # Extract the numeric value
        },
        "date": {
            "selector": ".published-date",
            "transform": "date"  # Parse and normalize the date
        },
        "email": {
            "selector": ".contact",
            "transform": "email"  # Extract an email address from the text
        },
        "phone": {
            "selector": ".phone",
            "transform": "phone"  # Extract and format a phone number
        },
        "url": {
            "selector": "a",
            "attribute": "href",
            "transform": "url"  # Convert to an absolute URL
        },
        "text": {
            "selector": ".description",
            "transform": "trim"  # Trim surrounding whitespace
        }
    }
)
Regular Expressions
regex_transform.py
result = client.scrape(
    url="https://example.com",
    rules={
        "sku": {
            "selector": ".product-info",
            "type": "text",
            "regex": r"SKU:\s*([A-Z0-9-]+)",  # Extract the SKU
            "regex_group": 1  # Use the first capture group
        },
        "dimensions": {
            "selector": ".specs",
            "type": "text",
            "regex": r"(\d+)\s*x\s*(\d+)\s*x\s*(\d+)",
            "regex_group": "all"  # Return all groups as a list
        },
        "price_numbers": {
            "selector": ".pricing",
            "type": "text",
            "regex": r"\d+\.\d+",  # Match every price-like number
            "regex_all": True  # Return all matches
        }
    }
)
print(result.data)
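Patterns can be sanity-checked locally with Python's re module before being sent to the API; the sample string below is illustrative:
import re

sample = "SKU: AB-1234 | Dimensions: 10 x 20 x 30 cm | $19.99"

# Confirm each pattern captures what the corresponding rule expects
assert re.search(r"SKU:\s*([A-Z0-9-]+)", sample).group(1) == "AB-1234"
assert re.search(r"(\d+)\s*x\s*(\d+)\s*x\s*(\d+)", sample).groups() == ("10", "20", "30")
assert re.findall(r"\d+\.\d+", sample) == ["19.99"]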
Custom JavaScript Transform
custom_transform.py
result = client.scrape(
    url="https://example.com",
    rules={
        "complex_data": {
            "selector": ".data-container",
            "type": "text",
            "custom_transform": """
                function(value) {
                    // Custom transformation logic
                    const parts = value.split('|');
                    return {
                        name: parts[0].trim(),
                        quantity: parseInt(parts[1], 10),
                        available: parts[2].trim() === 'yes'
                    };
                }
            """
        }
    }
)
print(result.data)
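Assuming the transform's return value is surfaced as-is under its rule key, the parsed object can then be read like any dict:
item = result.data["complex_data"]
print(item["name"], item["quantity"], item["available"])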
Conditional Extraction
Conditional Rules
conditional_rules.py
result = client.scrape(
    url="https://example.com",
    rules={
        "product_type": {
            "selector": ".product-type",
            "type": "text"
        },
        "digital_link": {
            "selector": ".download-link",
            "attribute": "href",
            "condition": {
                "field": "product_type",
                "equals": "digital"  # Only extract for digital products
            }
        },
        "shipping_weight": {
            "selector": ".weight",
            "type": "text",
            "condition": {
                "field": "product_type",
                "equals": "physical"
            }
        }
    }
)
print(result.data)
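Conditional fields are only populated when their condition matches; a field whose condition fails may be missing (or None) in the result, so guard access with .get():
if result.data["product_type"] == "digital":
    print(f"Download link: {result.data.get('digital_link')}")
else:
    print(f"Shipping weight: {result.data.get('shipping_weight', 'unknown')}")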
Pagination Rules
pagination_rules.py
result = client.scrape(
    url="https://example.com/products",
    rules={
        "products": {
            "selector": ".product",
            "type": "list",
            "item": {
                "name": ".product-name",
                "price": ".product-price"
            }
        }
    },
    pagination={
        "next_selector": "a.next-page",  # Selector for the "next page" link
        "max_pages": 10,
        "wait_time": 2,  # Seconds to wait between pages
        "stop_condition": {
            "selector": ".no-more-products",  # Stop if this element appears
            "exists": True
        }
    }
)

# All pages are scraped automatically
print(f"Total products: {len(result.data['products'])}")
Advanced Features
Dynamic Content Handling
dynamic_content.py
result = client.scrape(
    url="https://example.com",
    rules={
        "lazy_loaded_images": {
            "selector": "img[data-src]",
            "attribute": "data-src",
            "type": "list",
            "wait_for": {
                "selector": "img[src]",  # Wait until images load
                "timeout": 10
            }
        }
    },
    javascript={
        "enabled": True,
        "scroll": True,  # Scroll to trigger lazy loading
        "wait": 2000  # Wait for content to load
    }
)
Custom JavaScript Execution
custom_javascript.py
result = client.scrape(
    url="https://example.com",
    javascript={
        "enabled": True,
        "before_scrape": """
            // Executed before extraction begins
            document.querySelector('.load-more').click();
            await new Promise(r => setTimeout(r, 2000));
        """,
        "custom_extraction": """
            // Custom extraction logic run in the page context
            return {
                custom_data: window.myAppData,
                computed_value: calculateSomething()
            };
        """
    },
    rules={
        "title": ".title",
        "price": ".price"
    }
)
Error Handling in Rules
rules_error_handling.py
result = client.scrape(
    url="https://example.com",
    rules={
        "price": {
            "selector": ".price",
            "type": "text",
            "required": True  # Fail the scrape if this field is not found
        },
        "optional_field": {
            "selector": ".optional",
            "required": False,
            "default": "N/A"  # Used when the element is not found
        },
        "fallback_example": {
            "selectors": [  # Try each selector in order
                ".primary-selector",
                ".backup-selector",
                ".last-resort-selector"
            ],
            "type": "text"
        }
    }
)
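A missing required field causes the whole scrape to fail, so wrap the call accordingly. A minimal sketch, with rules holding the rule dictionary shown above; the broad except is a placeholder, and the SDK's specific exception class should be used instead:
try:
    result = client.scrape(url="https://example.com", rules=rules)
except Exception as exc:  # placeholder: substitute the SDK's specific error class
    print(f"Extraction failed: {exc}")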
Rule Templates
Save and reuse common rule configurations:
rule_templates.py
# Save a rule template
template = client.rules.create_template(
    name="product-extraction",
    rules={
        "name": {
            "selector": ".product-name",
            "type": "text"
        },
        "price": {
            "selector": ".price",
            "transform": "number"
        },
        "rating": {
            "selector": ".rating",
            "attribute": "data-rating",
            "transform": "number"
        }
    }
)

# Use the template
result = client.scrape(
    url="https://example.com",
    template="product-extraction"  # or template=template.id
)

# List all templates
templates = client.rules.list_templates()
for t in templates:
    print(f"{t.name}: {t.id}")
Best Practices
Custom Rules Tips
- Use specific selectors to avoid extracting the wrong data
- Prefer CSS selectors over XPath for better performance
- Always validate and transform data into the expected formats
- Use fallback selectors for resilient extraction
- Test rules on multiple pages before using them in production
- Create reusable templates for common patterns
- Combine custom rules with the Neural Engine for more robust extraction
Important Notes
- Custom rules require manual updates when site structure changes
- Complex JavaScript transforms may impact performance
- Always handle missing or optional fields gracefully
- Test extraction rules thoroughly before scaling up