E-Commerce Scraping Guide
Extract product data, prices, and inventory from online stores
What You'll Learn
- Scrape product catalogs and listings
- Extract prices, ratings, and reviews
- Handle pagination and dynamic content
- Monitor inventory and price changes
- Export data for analysis
Basic Product Scraping
scrape_products.py
from scrapehub import ScrapeHubClient
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Scrape product listing page
result = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    format="json"
)
# Process products
for product in result.data:
print(f"Product: {product['name']}")
print(f"Price: ${product['price']}")
print(f"Rating: {product['rating']}/5")
print(f"In Stock: {product['in_stock']}")
print(f"URL: {product['url']}")
print("-" * 50)Multi-Page Scraping
scrape_with_pagination.py
from scrapehub import ScrapeHubClient
import pandas as pd
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Scrape multiple pages automatically
result = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    pagination={
        "enabled": True,
        "max_pages": 50,
        "selector": "a.next-page"  # CSS selector for the "Next" button
    }
)
print(f"Total pages scraped: {result.pages_scraped}")
print(f"Total products found: {len(result.data)}")
# Save to CSV for analysis
df = pd.DataFrame(result.data)
df.to_csv('products.csv', index=False)
print("Data exported to products.csv")

Scraping Product Details
scrape_product_details.py
from scrapehub import ScrapeHubClient
import pandas as pd
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# First, get product listing
listing_result = client.scrape(
    url="https://example-shop.com/category/electronics",
    engine="neural-x1"
)
# Then scrape each product detail page
detailed_products = []
for product in listing_result.data[:10]: # First 10 products
    detail_result = client.scrape(
        url=product['url'],
        engine="neural-x1",
        render_js=True,
        wait_for_selector=".product-details"
    )
    if detail_result.data:
        detailed_products.append(detail_result.data[0])
        print(f"Scraped: {product['name']}")
# Export detailed data
df = pd.DataFrame(detailed_products)
df.to_excel('detailed_products.xlsx', index=False)
print(f"Exported {len(detailed_products)} detailed product records")Price Monitoring System
price_monitor.py
from scrapehub import ScrapeHubClient
import pandas as pd
from datetime import datetime
class PriceMonitor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)
        self.history_file = 'price_history.csv'

    def check_prices(self, urls):
        """Check current prices for a list of product URLs."""
        results = []
        for url in urls:
            result = self.client.scrape(
                url=url,
                engine="neural-x1"
            )
            if result.data:
                product = result.data[0]
                results.append({
                    'timestamp': datetime.now(),
                    'url': url,
                    'name': product.get('name'),
                    'price': product.get('price'),
                    'in_stock': product.get('in_stock', True)
                })
        return pd.DataFrame(results)

    def save_history(self, new_data):
        """Append new price data to the history file."""
        try:
            history = pd.read_csv(self.history_file)
            history = pd.concat([history, new_data], ignore_index=True)
        except FileNotFoundError:
            history = new_data
        history.to_csv(self.history_file, index=False)
        print(f"Saved {len(new_data)} price records")

    def get_price_changes(self):
        """Detect price changes since the last check."""
        history = pd.read_csv(self.history_file)
        history['timestamp'] = pd.to_datetime(history['timestamp'])
        # Keep the two most recent records for each URL
        latest = history.sort_values('timestamp').groupby('url').tail(2)
        changes = []
        for url in latest['url'].unique():
            url_data = latest[latest['url'] == url]
            if len(url_data) == 2:
                old_price = url_data.iloc[0]['price']
                new_price = url_data.iloc[1]['price']
                if old_price != new_price:
                    changes.append({
                        'name': url_data.iloc[1]['name'],
                        'url': url,
                        'old_price': old_price,
                        'new_price': new_price,
                        'change': new_price - old_price,
                        'percent_change': ((new_price - old_price) / old_price) * 100
                    })
        return pd.DataFrame(changes)
# Usage
monitor = PriceMonitor("sk_live_xxxx_449x")
# Products to monitor
urls = [
"https://example-shop.com/product/abc",
"https://example-shop.com/product/xyz",
"https://example-shop.com/product/123"
]
# Check current prices
current_prices = monitor.check_prices(urls)
monitor.save_history(current_prices)
# Detect changes
changes = monitor.get_price_changes()
if not changes.empty:
print("\nPrice Changes Detected:")
for _, change in changes.iterrows():
print(f"\n{change['name']}")
print(f" Old: $${change['old_price']:.2f}")
print(f" New: $${change['new_price']:.2f}")
print(f" Change: $${change['change']:.2f} ({change['percent_change']:.1f}%)")Competitor Analysis
competitor_analysis.py
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd
async def analyze_competitors():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Competitor URLs for the same product category
    competitors = {
        'Store A': 'https://store-a.com/category/laptops',
        'Store B': 'https://store-b.com/laptops',
        'Store C': 'https://store-c.com/computers/laptops'
    }

    # Start all competitor scrapes concurrently; without create_task the
    # coroutines would only run one at a time when awaited below
    tasks = []
    for store_name, url in competitors.items():
        task = asyncio.create_task(client.scrape(
            url=url,
            engine="neural-x1",
            pagination={"enabled": True, "max_pages": 5}
        ))
        tasks.append((store_name, task))

    # Collect results
    all_products = []
    for store_name, task in tasks:
        result = await task
        for product in result.data:
            product['store'] = store_name
            all_products.append(product)
        print(f"{store_name}: {len(result.data)} products")

    return pd.DataFrame(all_products)
# Run analysis
df = asyncio.run(analyze_competitors())
# Price comparison
print("\nPrice Statistics by Store:")
print(df.groupby('store')['price'].describe())
# Find cheapest options
print("\n=== Cheapest Products ===")
for product_name in df['name'].unique()[:5]:
    product_df = df[df['name'] == product_name]
    cheapest = product_df.loc[product_df['price'].idxmin()]
    print(f"\n{product_name}")
    print(f"  Cheapest at: {cheapest['store']} - ${cheapest['price']}")

# Export for further analysis
df.to_excel('competitor_analysis.xlsx', index=False)
print("\nFull analysis exported to competitor_analysis.xlsx")

Automated Inventory Monitoring
inventory_monitor.py
from scrapehub import ScrapeHubClient
import smtplib
from email.mime.text import MIMEText
from datetime import datetime
class InventoryMonitor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def check_availability(self, products):
        """Check whether watched products are in stock."""
        alerts = []
        for product in products:
            result = self.client.scrape(
                url=product['url'],
                engine="neural-x1"
            )
            if result.data:
                item = result.data[0]
                in_stock = item.get('in_stock', False)
                if in_stock and not product.get('was_in_stock', False):
                    # Product back in stock!
                    alerts.append({
                        'type': 'back_in_stock',
                        'name': item['name'],
                        'price': item['price'],
                        'url': product['url']
                    })
                # Update status
                product['was_in_stock'] = in_stock
                product['last_checked'] = datetime.now()
        return alerts

    def send_alert(self, alerts, email_to):
        """Send an email notification for stock alerts."""
        if not alerts:
            return
        message = "Stock Alerts:\n\n"
        for alert in alerts:
            message += f"✅ {alert['name']} is back in stock!\n"
            message += f"   Price: ${alert['price']}\n"
            message += f"   URL: {alert['url']}\n\n"
        msg = MIMEText(message)
        msg['Subject'] = f"Stock Alert: {len(alerts)} items available"
        msg['From'] = "alerts@yourdomain.com"
        msg['To'] = email_to
        # Send the email (configure SMTP settings for your provider)
        # smtp = smtplib.SMTP('smtp.gmail.com', 587)
        # smtp.starttls()
        # smtp.login('your_email', 'your_password')
        # smtp.send_message(msg)
        # smtp.quit()
        print(f"Alert sent: {len(alerts)} items back in stock")
# Usage
monitor = InventoryMonitor("sk_live_xxxx_449x")
# Products to watch
watchlist = [
    {'url': 'https://example-shop.com/product/limited-edition-1', 'was_in_stock': False},
    {'url': 'https://example-shop.com/product/limited-edition-2', 'was_in_stock': False},
]

# Check availability
alerts = monitor.check_availability(watchlist)

# Send notifications
if alerts:
    monitor.send_alert(alerts, "your-email@example.com")
    print(f"Found {len(alerts)} products back in stock!")
else:
    print("No new items in stock")

Best Practices
- Use pagination to scrape entire catalogs efficiently
- Enable JavaScript rendering for dynamic price updates
- Set up webhooks for long-running scrape jobs (see the sketch after this list)
- Schedule regular checks to monitor price changes (a scheduler sketch also follows)
- Respect rate limits: use async jobs for bulk operations
- Store historical data for trend analysis
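The webhook and scheduling items above deserve a concrete shape. First, a bulk-job sketch. Note that create_job, its webhook_url parameter, and job.id are assumed names for illustration, not documented ScrapeHub API; check the jobs reference for the real signatures.
submit_bulk_job.py
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Hypothetical job-based API: submit a large crawl and return immediately.
# `create_job`, `webhook_url`, and `job.id` are illustrative names, not
# confirmed ScrapeHub signatures; consult the API reference for the real ones.
job = client.create_job(
    url="https://example-shop.com/products",
    engine="neural-x1",
    pagination={"enabled": True, "max_pages": 500},
    webhook_url="https://yourdomain.com/hooks/scrape-complete"
)
print(f"Job submitted: {job.id}")
# ScrapeHub would POST the results to the webhook URL when the crawl
# finishes, so nothing blocks while hundreds of pages are scraped.
For scheduled checks, here is a minimal sketch using the third-party schedule package (a cron entry works just as well). It reuses the PriceMonitor class and urls list from price_monitor.py above, assuming that module can be imported without re-running its top-level usage code.
scheduled_checks.py
import time

import schedule  # third-party: pip install schedule

from price_monitor import PriceMonitor, urls  # defined earlier in this guide

monitor = PriceMonitor("sk_live_xxxx_449x")

def run_price_check():
    # One monitoring cycle: scrape, persist, report changes
    current = monitor.check_prices(urls)
    monitor.save_history(current)
    changes = monitor.get_price_changes()
    if not changes.empty:
        print(changes)

# Run every 6 hours
schedule.every(6).hours.do(run_price_check)

while True:
    schedule.run_pending()
    time.sleep(60)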
Common Use Cases
Price Intelligence
- Track competitor pricing
- Dynamic pricing strategies
- Historical price trends (see the sketch below)
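Once the PriceMonitor above has been running for a while, price_history.csv doubles as a trend dataset. A minimal pandas sketch, assuming the column layout written by save_history:
price_trends.py
import pandas as pd

# Assumes the price_history.csv layout written by PriceMonitor.save_history
history = pd.read_csv('price_history.csv', parse_dates=['timestamp'])

# Per-product movement between the first and most recent observation
trend = (
    history.sort_values('timestamp')
    .groupby('name')['price']
    .agg(first_price='first', last_price='last', low='min', high='max')
)
trend['change_pct'] = (trend['last_price'] - trend['first_price']) / trend['first_price'] * 100
print(trend.sort_values('change_pct'))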
Inventory Management
- Stock availability monitoring
- Restock alerts
- Product lifecycle tracking
Market Research
- Product catalog analysis
- Category trends (see the sketch after this list)
- Brand monitoring
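A quick catalog breakdown for the two items above might look like the following sketch. It reads the products.csv export from the pagination example; the category and brand columns are assumptions, since the available fields depend on what the extractor returns for your target store.
category_trends.py
import pandas as pd

# Assumes the products.csv export above; the 'category' and 'brand'
# columns are assumptions that depend on the extractor output
df = pd.read_csv('products.csv')

# Product count and median price per category
print(df.groupby('category')['price'].agg(['count', 'median']))

# Brand share within the catalog (top 10)
print(df['brand'].value_counts(normalize=True).head(10))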
Review Analysis
- Customer sentiment
- Rating aggregation (see the sketch after this list)
- Competitor comparison
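The listings scraped earlier already carry a rating field, so basic rating aggregation is a short groupby. A sketch assuming the competitor_analysis.xlsx export from the competitor analysis example, and that the extractor returned the rating field shown in the basic example:
rating_aggregation.py
import pandas as pd

# Assumes the competitor_analysis.xlsx export above, with the 'store',
# 'name', and 'rating' fields shown in earlier snippets
df = pd.read_excel('competitor_analysis.xlsx')

# Rating summary per store for a quick competitor comparison
print(df.groupby('store')['rating'].agg(['count', 'mean', 'min', 'max']).round(2))

# Flag products rated well below their store's average
store_avg = df.groupby('store')['rating'].transform('mean')
weak = df[df['rating'] < store_avg - 0.5]
print(f"\n{len(weak)} products rated at least 0.5 below their store average")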