Privacy Guidelines
Best practices for handling personal and sensitive data
Privacy-First Approach
ScrapeHub is committed to protecting privacy. These guidelines help you build privacy-respecting scraping workflows and handle personal data responsibly.
Data Privacy Principles
Collect Only What You Need
Extract only the data fields necessary for your specific use case
Encrypt Sensitive Data
Use encryption for personal information both in transit and at rest (see the sketch after this list)
Anonymize When Possible
Remove or hash personally identifiable information when not needed
Document Processing
Maintain clear records of what data you collect and why
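The encryption principle can be made concrete with a short sketch. The example below uses the cryptography package's Fernet recipe for field-level encryption at rest (transport encryption is already handled by HTTPS). Key handling is simplified here as an assumption: in production the key should come from a secrets manager, never from source code.

from cryptography.fernet import Fernet

# Assumption: key management is handled elsewhere; in production,
# load the key from a secrets manager rather than generating it inline.
key = Fernet.generate_key()
fernet = Fernet(key)

def encrypt_field(value: str) -> bytes:
    """Encrypt a single sensitive field before storage."""
    return fernet.encrypt(value.encode())

def decrypt_field(token: bytes) -> str:
    """Decrypt a field only when it is legitimately needed."""
    return fernet.decrypt(token).decode()

record = {"email": encrypt_field("jane@example.com")}

Fernet provides authenticated symmetric encryption, so tampered ciphertexts fail to decrypt rather than yielding garbage.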
Identifying Personal Data
Personal data is any information that can identify an individual, directly or indirectly. Common examples include:
Direct Identifiers
Names, email addresses, phone numbers, social security numbers, passport numbers
Online Identifiers
IP addresses, device IDs, cookie identifiers, usernames, account numbers
Biometric Data
Facial recognition data, fingerprints, voice patterns, retinal scans
Location Data
GPS coordinates, home addresses, workplace locations, travel patterns
Financial Information
Bank account numbers, credit card details, income data, transaction history
Data Anonymization Techniques
Pseudonymization
Replace identifying fields with pseudonyms while maintaining data utility:
import hashlib
import uuid
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="your_api_key")

def pseudonymize_data(data):
    """Replace personal identifiers with pseudonyms"""
    # Hash email addresses (for stronger protection, use a keyed hash
    # such as HMAC so pseudonyms can't be reversed by hashing guesses)
    if 'email' in data:
        email_hash = hashlib.sha256(
            data['email'].encode()
        ).hexdigest()
        data['email_id'] = email_hash[:16]  # First 16 hex chars
        del data['email']

    # Hash names
    if 'name' in data:
        name_hash = hashlib.sha256(
            data['name'].encode()
        ).hexdigest()
        data['user_id'] = name_hash[:16]
        del data['name']

    # Replace phone numbers with random unique IDs
    if 'phone' in data:
        data['contact_id'] = str(uuid.uuid4())
        del data['phone']

    return data

# Scrape and pseudonymize
result = client.scrape("https://example.com")
anonymized = pseudonymize_data(result.data)
print(anonymized)
# Output: {'email_id': 'a1b2c3d4e5f60718', 'user_id': '910f1a2b3c4d5e6f', ...}

Data Masking
def mask_personal_data(data):
    """Mask sensitive portions of personal data"""
    # Mask email (show only domain)
    if 'email' in data:
        email = data['email']
        parts = email.split('@')
        if len(parts) == 2:
            data['email'] = f"****@{parts[1]}"

    # Mask phone number (show last 4 digits)
    if 'phone' in data:
        phone = data['phone']
        data['phone'] = f"***-***-{phone[-4:]}"

    # Mask credit card (show last 4 digits)
    if 'card_number' in data:
        card = data['card_number']
        data['card_number'] = f"****-****-****-{card[-4:]}"

    # Mask address (keep only city and country);
    # extract_city_country is a helper you define for your address format
    if 'address' in data:
        address = data['address']
        data['location'] = extract_city_country(address)
        del data['address']

    return data

result = client.scrape("https://example.com")
masked = mask_personal_data(result.data)
print(masked)
# Output: {'email': '****@example.com', 'phone': '***-***-1234', ...}

Aggregation and Generalization
from datetime import datetime

def generalize_data(data):
    """Remove specific details, keep general patterns"""
    # Replace exact age with an age range
    if 'age' in data:
        age = data['age']
        if age < 18:
            data['age_group'] = '0-17'
        elif age < 30:
            data['age_group'] = '18-29'
        elif age < 50:
            data['age_group'] = '30-49'
        else:
            data['age_group'] = '50+'
        del data['age']

    # Replace exact location with a region; get_region is a helper
    # you define for your geography (e.g., returns "Northeast US")
    if 'city' in data:
        city = data['city']
        data['region'] = get_region(city)
        del data['city']

    # Replace exact timestamp with date only
    if 'timestamp' in data:
        ts = datetime.fromisoformat(data['timestamp'])
        data['date'] = ts.strftime('%Y-%m-%d')  # Remove time of day
        del data['timestamp']

    # Replace exact salary with a 10,000-wide band
    if 'salary' in data:
        salary = data['salary']
        data['salary_band'] = f"{(salary // 10000) * 10000}+"
        del data['salary']

    return data

result = client.scrape("https://example.com")
generalized = generalize_data(result.data)
print(generalized)
# Output: {'age_group': '30-49', 'region': 'Northeast US', ...}

Data Retention and Deletion
Implementing Retention Policies
from datetime import datetime, timedelta

class DataRetentionManager:
    def __init__(self, retention_days=90):
        self.retention_days = retention_days

    def store_with_expiry(self, data, purpose):
        """Store data with automatic expiry"""
        expiry_date = datetime.now() + timedelta(days=self.retention_days)
        record = {
            'data': data,
            'collected_at': datetime.now().isoformat(),
            'expires_at': expiry_date.isoformat(),
            'purpose': purpose,
            'status': 'active'
        }
        # Save to database with expiry
        self.save_to_db(record)
        return record

    def cleanup_expired_data(self):
        """Delete data that has exceeded the retention period"""
        now = datetime.now()
        # Query database for expired records
        expired_records = self.query_expired_records(now)
        for record in expired_records:
            # Securely delete the data
            self.secure_delete(record['id'])
            print(f"Deleted expired record: {record['id']}")

    def secure_delete(self, record_id):
        """Securely delete data beyond recovery"""
        # Overwrite data before deletion
        # Mark as deleted in the audit log
        # Remove from all backups
        pass

    # save_to_db and query_expired_records wrap whatever storage
    # backend you use; implement them for your database.

# Usage
manager = DataRetentionManager(retention_days=90)
result = client.scrape("https://example.com")
manager.store_with_expiry(
    data=result.data,
    purpose="market_research"
)

# Run the cleanup job daily
manager.cleanup_expired_data()

Responding to Deletion Requests
import uuid
from datetime import datetime

class DataDeletionHandler:
    def handle_deletion_request(self, user_identifier):
        """Process a user's right to erasure (GDPR Article 17)"""
        # Log the deletion request
        self.log_deletion_request(user_identifier)

        # Find all data associated with the user
        user_data = self.find_user_data(user_identifier)
        if not user_data:
            return {
                'status': 'no_data_found',
                'message': 'No data found for this identifier'
            }

        # Delete from primary database
        self.delete_from_database(user_identifier)
        # Delete from backups
        self.delete_from_backups(user_identifier)
        # Delete from caches
        self.delete_from_cache(user_identifier)
        # Delete from analytics
        self.delete_from_analytics(user_identifier)

        # Send confirmation
        return {
            'status': 'completed',
            'deleted_records': len(user_data),
            'timestamp': datetime.now().isoformat(),
            'confirmation_id': str(uuid.uuid4())
        }

    def verify_deletion(self, confirmation_id):
        """Verify that data was successfully deleted"""
        # Check all systems for any remaining data
        # Return a verification report
        pass

    # log_deletion_request, find_user_data, and the delete_from_*
    # methods wrap your own storage, backup, cache, and analytics
    # systems; implement them for your infrastructure.

handler = DataDeletionHandler()
result = handler.handle_deletion_request('user@example.com')
print(f"Deletion completed: {result['confirmation_id']}")

Privacy-Preserving Scraping
Selective Data Extraction
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="your_api_key")

# ❌ Bad: Scrape everything
# result = client.scrape("https://example.com")

# ✅ Good: Scrape only necessary fields
result = client.scrape(
    url="https://example.com",
    selectors={
        # Public product information only
        "product_name": ".product-title",
        "price": ".product-price",
        "category": ".product-category",
        # Don't scrape:
        # - User reviews with personal opinions
        # - Customer names or emails
        # - Purchase history
        # - Personal recommendations
    },
    exclude_patterns=[
        ".user-info",
        ".customer-reviews",
        ".personal-data"
    ]
)

print(result.data)  # Only contains non-personal product data

Filtering Sensitive Content
import re

class SensitiveDataFilter:
    def __init__(self):
        # Regex patterns for common PII
        self.patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'credit_card': r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b',
            'ip_address': r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
        }

    def filter_pii(self, text):
        """Redact PII from scraped text"""
        filtered_text = text
        for data_type, pattern in self.patterns.items():
            filtered_text = re.sub(
                pattern,
                f'[{data_type.upper()}_REDACTED]',
                filtered_text
            )
        return filtered_text

    def scan_for_pii(self, data):
        """Scan data fields for potential PII"""
        found_pii = {}
        for field, value in data.items():
            if isinstance(value, str):
                for data_type, pattern in self.patterns.items():
                    matches = re.findall(pattern, value)
                    if matches:
                        found_pii[field] = data_type
        return found_pii

# Usage
filter_tool = SensitiveDataFilter()
result = client.scrape("https://example.com")

# Scan for PII
pii_found = filter_tool.scan_for_pii(result.data)
if pii_found:
    print(f"Warning: Found PII in fields: {pii_found}")

# Filter out PII
for field in result.data:
    if isinstance(result.data[field], str):
        result.data[field] = filter_tool.filter_pii(result.data[field])

print(result.data)

Consent and Transparency
Transparency Requirements
- Clearly identify your scraper with an appropriate User-Agent (see the sketch after this list)
- Provide a contact method for website owners, e.g., a URL or email address in your User-Agent string
- Document what data you collect and why
- Inform users if you collect their personal data from public sources
- Maintain a privacy policy explaining your data practices
- Honor opt-out requests promptly
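As a sketch of the first two requirements, you can configure the client with an identifying User-Agent that includes a contact URL and email. Note that the user_agent option shown here is an assumption about the client constructor, not a documented ScrapeHub parameter; check your SDK reference for the exact name.

from scrapehub import ScrapeHubClient

# Hypothetical user_agent option: verify the parameter name
# against your ScrapeHub SDK version before relying on it.
client = ScrapeHubClient(
    api_key="your_api_key",
    user_agent="AcmeResearchBot/1.0 (+https://acme.example/bot; contact: scraping@acme.example)"
)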
Privacy Impact Assessment
Before starting a new scraping project, conduct a privacy impact assessment:
1. Identify Data Types
What types of data will you collect? Is any of it personal or sensitive?
2. Assess Necessity
Do you really need this data? Can you achieve your goal with less information?
3. Evaluate Risks
What are the privacy risks to individuals if this data is compromised?
4. Implement Safeguards
What technical and organizational measures will protect the data?
5. Document Decisions
Record your assessment, decisions, and rationale for compliance purposes (a minimal record template is sketched below)
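For step 5, a minimal assessment record can be kept alongside the project. The sketch below is illustrative only; the field names and file path are suggestions, not a ScrapeHub API.

from datetime import datetime
import json

# Illustrative privacy impact assessment record for one project
pia_record = {
    "project": "market_research_2024",
    "assessed_at": datetime.now().isoformat(),
    "data_types": ["product_name", "price", "category"],
    "personal_data": False,
    "necessity_rationale": "Pricing analysis requires no personal fields",
    "risks": "Low: no personal or sensitive data collected",
    "safeguards": ["selector allowlist", "PII scanning", "90-day retention"],
    "reviewer": "privacy@yourcompany.example"
}

# Persist the record so the decision trail is auditable
with open("pia_market_research_2024.json", "w") as f:
    json.dump(pia_record, f, indent=2)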
ScrapeHub Privacy Features
Automatic PII Detection
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(
    api_key="your_api_key",
    enable_pii_detection=True  # Enable automatic PII detection
)

result = client.scrape(
    url="https://example.com",
    privacy_mode="strict"  # strict, moderate, or disabled
)

# ScrapeHub will warn if PII is detected
if result.pii_detected:
    print("Warning: PII detected in scraped content")
    print(f"Fields with PII: {result.pii_fields}")
    print(f"Types detected: {result.pii_types}")

    # Option to automatically redact PII
    clean_data = result.get_redacted_data()
    print(clean_data)
else:
    print("No PII detected")
    print(result.data)

Important Privacy Reminders
- Never scrape data you don't have a legitimate purpose for
- Don't collect more personal data than necessary
- Always secure personal data with encryption
- Delete personal data when it's no longer needed
- Respect data subject rights (access, deletion, correction)
- Maintain detailed logs of data processing activities
- When in doubt, consult with a privacy professional or legal counsel