E-Commerce Scraping Guide
Extract product data, prices, and inventory from online stores
What You'll Learn
- Scrape product catalogs and listings
- Extract prices, ratings, and reviews
- Handle pagination and dynamic content
- Monitor inventory and price changes
- Export data for analysis
Basic Product Scraping
scrape_products.py
from scrapehub import ScrapeHubClient
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Scrape product listing page
result = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    format="json"
)
# Process products
for product in result.data:
print(f"Product: {product['name']}")
print(f"Price: ${product['price']}")
print(f"Rating: {product['rating']}/5")
print(f"In Stock: {product['in_stock']}")
print(f"URL: {product['url']}")
print("-" * 50)Multi-Page Scraping
scrape_with_pagination.py
from scrapehub import ScrapeHubClient
import pandas as pd
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# Scrape multiple pages automatically
result = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    pagination={
        "enabled": True,
        "max_pages": 50,
        "selector": "a.next-page"  # CSS selector for the "Next" button
    }
)
print(f"Total pages scraped: {result.pages_scraped}")
print(f"Total products found: {len(result.data)}")
# Save to CSV for analysis
df = pd.DataFrame(result.data)
df.to_csv('products.csv', index=False)
print("Data exported to products.csv")

Scraping Product Details
scrape_product_details.py
from scrapehub import ScrapeHubClient
import pandas as pd
client = ScrapeHubClient(api_key="sk_live_xxxx_449x")
# First, get product listing
listing_result = client.scrape(
    url="https://example-shop.com/category/electronics",
    engine="neural-x1"
)
# Then scrape each product detail page
detailed_products = []
for product in listing_result.data[:10]: # First 10 products
    detail_result = client.scrape(
        url=product['url'],
        engine="neural-x1",
        render_js=True,
        wait_for_selector=".product-details"
    )
    if detail_result.data:
        detailed_products.append(detail_result.data[0])
        print(f"Scraped: {product['name']}")
# Export detailed data
df = pd.DataFrame(detailed_products)
df.to_excel('detailed_products.xlsx', index=False)
print(f"Exported {len(detailed_products)} detailed product records")Price Monitoring System
price_monitor.py
from scrapehub import ScrapeHubClient
import pandas as pd
from datetime import datetime
class PriceMonitor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)
        self.history_file = 'price_history.csv'

    def check_prices(self, urls):
        """Check current prices for a list of product URLs."""
        results = []
        for url in urls:
            result = self.client.scrape(
                url=url,
                engine="neural-x1"
            )
            if result.data:
                product = result.data[0]
                results.append({
                    'timestamp': datetime.now(),
                    'url': url,
                    'name': product.get('name'),
                    'price': product.get('price'),
                    'in_stock': product.get('in_stock', True)
                })
        return pd.DataFrame(results)

    def save_history(self, new_data):
        """Append new price data to the history file."""
        try:
            history = pd.read_csv(self.history_file)
            history = pd.concat([history, new_data], ignore_index=True)
        except FileNotFoundError:
            history = new_data
        history.to_csv(self.history_file, index=False)
        print(f"Saved {len(new_data)} price records")

    def get_price_changes(self):
        """Detect price changes since the last check."""
        history = pd.read_csv(self.history_file)
        history['timestamp'] = pd.to_datetime(history['timestamp'])
        # Keep the two most recent records for each URL
        latest = history.sort_values('timestamp').groupby('url').tail(2)
        changes = []
        for url in latest['url'].unique():
            url_data = latest[latest['url'] == url]
            if len(url_data) == 2:
                old_price = url_data.iloc[0]['price']
                new_price = url_data.iloc[1]['price']
                if old_price != new_price:
                    changes.append({
                        'name': url_data.iloc[1]['name'],
                        'url': url,
                        'old_price': old_price,
                        'new_price': new_price,
                        'change': new_price - old_price,
                        'percent_change': ((new_price - old_price) / old_price) * 100
                    })
        return pd.DataFrame(changes)
# Usage
monitor = PriceMonitor("sk_live_xxxx_449x")
# Products to monitor
urls = [
"https://example-shop.com/product/abc",
"https://example-shop.com/product/xyz",
"https://example-shop.com/product/123"
]
# Check current prices
current_prices = monitor.check_prices(urls)
monitor.save_history(current_prices)
# Detect changes
changes = monitor.get_price_changes()
if not changes.empty:
print("\nPrice Changes Detected:")
for _, change in changes.iterrows():
print(f"\n{change['name']}")
print(f" Old: $${change['old_price']:.2f}")
print(f" New: $${change['new_price']:.2f}")
print(f" Change: $${change['change']:.2f} ({change['percent_change']:.1f}%)")Competitor Analysis
competitor_analysis.py
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd
async def analyze_competitors():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Competitor URLs for the same product category
    competitors = {
        'Store A': 'https://store-a.com/category/laptops',
        'Store B': 'https://store-b.com/laptops',
        'Store C': 'https://store-c.com/computers/laptops'
    }

    # Start all competitor scrapes concurrently; without create_task the
    # coroutines would only run one at a time when awaited below
    tasks = []
    for store_name, url in competitors.items():
        task = asyncio.create_task(client.scrape(
            url=url,
            engine="neural-x1",
            pagination={"enabled": True, "max_pages": 5}
        ))
        tasks.append((store_name, task))

    # Collect results
    all_products = []
    for store_name, task in tasks:
        result = await task
        for product in result.data:
            product['store'] = store_name
            all_products.append(product)
        print(f"{store_name}: {len(result.data)} products")

    return pd.DataFrame(all_products)
# Run analysis
df = asyncio.run(analyze_competitors())
# Price comparison
print("\nPrice Statistics by Store:")
print(df.groupby('store')['price'].describe())
# Find cheapest options
print("\n=== Cheapest Products ===")
for product_name in df['name'].unique()[:5]:
    product_df = df[df['name'] == product_name]
    cheapest = product_df.loc[product_df['price'].idxmin()]
    print(f"\n{product_name}")
    print(f"  Cheapest at: {cheapest['store']} - ${cheapest['price']}")

# Export for further analysis
df.to_excel('competitor_analysis.xlsx', index=False)
print("\nFull analysis exported to competitor_analysis.xlsx")

Automated Inventory Monitoring
inventory_monitor.py
from scrapehub import ScrapeHubClient
import smtplib
from email.mime.text import MIMEText
from datetime import datetime
class InventoryMonitor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def check_availability(self, products):
        """Check whether watched products are in stock."""
        alerts = []
        for product in products:
            result = self.client.scrape(
                url=product['url'],
                engine="neural-x1"
            )
            if result.data:
                item = result.data[0]
                in_stock = item.get('in_stock', False)
                if in_stock and not product.get('was_in_stock', False):
                    # Product back in stock!
                    alerts.append({
                        'type': 'back_in_stock',
                        'name': item['name'],
                        'price': item['price'],
                        'url': product['url']
                    })
                # Update status
                product['was_in_stock'] = in_stock
                product['last_checked'] = datetime.now()
        return alerts

    def send_alert(self, alerts, email_to):
        """Send an email notification for stock alerts."""
        if not alerts:
            return
        message = "Stock Alerts:\n\n"
        for alert in alerts:
            message += f"✅ {alert['name']} is back in stock!\n"
            message += f"   Price: ${alert['price']}\n"
            message += f"   URL: {alert['url']}\n\n"
        msg = MIMEText(message)
        msg['Subject'] = f"Stock Alert: {len(alerts)} items available"
        msg['From'] = "alerts@yourdomain.com"
        msg['To'] = email_to
        # Send the email (configure SMTP settings for your provider)
        # smtp = smtplib.SMTP('smtp.gmail.com', 587)
        # smtp.starttls()
        # smtp.login('your_email', 'your_password')
        # smtp.send_message(msg)
        # smtp.quit()
        print(f"Alert sent: {len(alerts)} items back in stock")
# Usage
monitor = InventoryMonitor("sk_live_xxxx_449x")
# Products to watch
watchlist = [
    {'url': 'https://example-shop.com/product/limited-edition-1', 'was_in_stock': False},
    {'url': 'https://example-shop.com/product/limited-edition-2', 'was_in_stock': False},
]

# Check availability
alerts = monitor.check_availability(watchlist)

# Send notifications
if alerts:
    monitor.send_alert(alerts, "your-email@example.com")
    print(f"Found {len(alerts)} products back in stock!")
else:
    print("No new items in stock")

Best Practices
- Use pagination to scrape entire catalogs efficiently
- Enable JavaScript rendering for dynamic price updates
- Set up webhooks for long-running scrape jobs (see the sketch after this list)
- Schedule regular checks to monitor price changes (a scheduler sketch also follows)
- Respect rate limits: use async jobs for bulk operations
- Store historical data for trend analysis
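The webhook and scheduling items above deserve a concrete shape. First, a bulk-job sketch. Note that create_job, its webhook_url parameter, and job.id are assumed names for illustration, not documented ScrapeHub API; check the jobs reference for the real signatures.
submit_bulk_job.py
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Hypothetical job-based API: submit a large crawl and return immediately.
# `create_job`, `webhook_url`, and `job.id` are illustrative names, not
# confirmed ScrapeHub signatures; consult the API reference for the real ones.
job = client.create_job(
    url="https://example-shop.com/products",
    engine="neural-x1",
    pagination={"enabled": True, "max_pages": 500},
    webhook_url="https://yourdomain.com/hooks/scrape-complete"
)
print(f"Job submitted: {job.id}")
# ScrapeHub would POST the results to the webhook URL when the crawl
# finishes, so nothing blocks while hundreds of pages are scraped.
For scheduled checks, here is a minimal sketch using the third-party schedule package (a cron entry works just as well). It reuses the PriceMonitor class and urls list from price_monitor.py above, assuming that module can be imported without re-running its top-level usage code.
scheduled_checks.py
import time

import schedule  # third-party: pip install schedule

from price_monitor import PriceMonitor, urls  # defined earlier in this guide

monitor = PriceMonitor("sk_live_xxxx_449x")

def run_price_check():
    # One monitoring cycle: scrape, persist, report changes
    current = monitor.check_prices(urls)
    monitor.save_history(current)
    changes = monitor.get_price_changes()
    if not changes.empty:
        print(changes)

# Run every 6 hours
schedule.every(6).hours.do(run_price_check)

while True:
    schedule.run_pending()
    time.sleep(60)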
Common Use Cases
Price Intelligence
- Track competitor pricing
- Dynamic pricing strategies
- Historical price trends (see the sketch below)
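Once the PriceMonitor above has been running for a while, price_history.csv doubles as a trend dataset. A minimal pandas sketch, assuming the column layout written by save_history:
price_trends.py
import pandas as pd

# Assumes the price_history.csv layout written by PriceMonitor.save_history
history = pd.read_csv('price_history.csv', parse_dates=['timestamp'])

# Per-product movement between the first and most recent observation
trend = (
    history.sort_values('timestamp')
    .groupby('name')['price']
    .agg(first_price='first', last_price='last', low='min', high='max')
)
trend['change_pct'] = (trend['last_price'] - trend['first_price']) / trend['first_price'] * 100
print(trend.sort_values('change_pct'))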
Inventory Management
- Stock availability monitoring
- Restock alerts
- Product lifecycle tracking
Market Research
- Product catalog analysis
- Category trends (see the sketch after this list)
- Brand monitoring
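A quick catalog breakdown for the two items above might look like the following sketch. It reads the products.csv export from the pagination example; the category and brand columns are assumptions, since the available fields depend on what the extractor returns for your target store.
category_trends.py
import pandas as pd

# Assumes the products.csv export above; the 'category' and 'brand'
# columns are assumptions that depend on the extractor output
df = pd.read_csv('products.csv')

# Product count and median price per category
print(df.groupby('category')['price'].agg(['count', 'median']))

# Brand share within the catalog (top 10)
print(df['brand'].value_counts(normalize=True).head(10))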
Review Analysis
- Customer sentiment
- Rating aggregation (see the sketch after this list)
- Competitor comparison
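The listings scraped earlier already carry a rating field, so basic rating aggregation is a short groupby. A sketch assuming the competitor_analysis.xlsx export from the competitor analysis example, and that the extractor returned the rating field shown in the basic example:
rating_aggregation.py
import pandas as pd

# Assumes the competitor_analysis.xlsx export above, with the 'store',
# 'name', and 'rating' fields shown in earlier snippets
df = pd.read_excel('competitor_analysis.xlsx')

# Rating summary per store for a quick competitor comparison
print(df.groupby('store')['rating'].agg(['count', 'mean', 'min', 'max']).round(2))

# Flag products rated well below their store's average
store_avg = df.groupby('store')['rating'].transform('mean')
weak = df[df['rating'] < store_avg - 0.5]
print(f"\n{len(weak)} products rated at least 0.5 below their store average")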