Python is one of the most popular languages for web scraping, and Twitter (now X) is one of the most heavily scraped platforms. In this tutorial, you'll learn how to build a robust Twitter scraper from scratch using Python.
By the end of this guide, you'll be able to extract tweets, user profiles, followers, and hashtag data programmatically.
Prerequisites
Before we start, make sure you have:
- Python 3.8+ installed on your system
- pip for package management
- Basic understanding of HTTP requests and HTML
- A code editor (VS Code, PyCharm, etc.)
Choosing Your Approach
There are several ways to scrape Twitter with Python:
1. Official Twitter API
Twitter provides an official API, but it has significant limitations:
- Expensive for commercial use ($100-$5,000+/month)
- Strict rate limits (even on paid tiers)
- Limited historical data access
- Many endpoints restricted to Enterprise tier
2. Web Scraping
Scraping Twitter's web interface directly:
- Free to implement
- Access to all public data
- Requires handling anti-bot measures
- Can break when Twitter updates their site
3. Third-Party APIs
Services like X (Twitter) Scraper API handle the complexity for you:
- Reliable and maintained
- No need to handle anti-bot measures
- Consistent data format
- Cost-effective compared to official API
Project Setup
Let's set up our Python project:
# Create project directory
mkdir twitter-scraper
cd twitter-scraper
# Create virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies
pip install requests beautifulsoup4 pandas lxml
Create a requirements.txt file:
requests==2.31.0
beautifulsoup4==4.12.2
pandas==2.1.0
lxml==4.9.3
Scraping Tweets
Here's a basic example of scraping tweets using a Twitter scraping API:
import requests

class TwitterScraper:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.x-scraper.com/v1"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def search_tweets(self, query, count=100):
        """Search for tweets matching a query."""
        endpoint = f"{self.base_url}/search"
        params = {
            "q": query,
            "count": count
        }
        response = requests.get(
            endpoint,
            headers=self.headers,
            params=params
        )
        if response.status_code == 200:
            return response.json()["tweets"]
        else:
            raise Exception(f"Error: {response.status_code}")

    def get_user_tweets(self, username, count=100):
        """Get tweets from a specific user."""
        endpoint = f"{self.base_url}/user/{username}/tweets"
        params = {"count": count}
        response = requests.get(
            endpoint,
            headers=self.headers,
            params=params
        )
        if response.status_code == 200:
            return response.json()["tweets"]
        else:
            raise Exception(f"Error: {response.status_code}")

# Usage example
scraper = TwitterScraper("your-api-key")
tweets = scraper.search_tweets("python programming", count=50)

for tweet in tweets:
    print(f"@{tweet['username']}: {tweet['text']}")
    print(f"Likes: {tweet['likes']} | Retweets: {tweet['retweets']}")
    print("-" * 50)
Extracting Tweet Data
Each tweet typically contains these fields:
- id - Unique tweet identifier
- text - Tweet content
- username - Author's handle
- created_at - Timestamp
- likes - Like count
- retweets - Retweet count
- replies - Reply count
- media - Attached images/videos
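For reference, a single tweet record might look like the following. The field names mirror the list above, but the exact shape depends on the API you use, and every value here is invented for illustration:

# Illustrative tweet record (all values are made up)
sample_tweet = {
    "id": "1845671234567890123",
    "text": "Just shipped a new release of our Python scraper!",
    "username": "example_dev",
    "created_at": "2024-10-14T09:32:00Z",
    "likes": 42,
    "retweets": 7,
    "replies": 3,
    "media": []
}

print(f"@{sample_tweet['username']} ({sample_tweet['likes']} likes): {sample_tweet['text']}")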
Scraping User Profiles
Here's how to extract user profile information:
# Add this method to the TwitterScraper class
def get_user_profile(self, username):
    """Get detailed profile information for a user."""
    endpoint = f"{self.base_url}/user/{username}"
    response = requests.get(
        endpoint,
        headers=self.headers
    )
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Error: {response.status_code}")

# Usage
profile = scraper.get_user_profile("elonmusk")

print(f"Name: {profile['name']}")
print(f"Bio: {profile['description']}")
print(f"Followers: {profile['followers_count']:,}")
print(f"Following: {profile['following_count']:,}")
Profile Data Fields
- name - Display name
- username - Twitter handle
- description - Bio text
- followers_count - Number of followers
- following_count - Number following
- tweet_count - Total tweets
- created_at - Account creation date
- verified - Verification status
- location - Profile location
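As with tweets, the exact response shape varies by provider, but a profile object will typically resemble this sketch (values below are invented for illustration):

# Illustrative profile record (all values are made up)
sample_profile = {
    "name": "Example Dev",
    "username": "example_dev",
    "description": "Building things with Python.",
    "followers_count": 12850,
    "following_count": 310,
    "tweet_count": 4721,
    "created_at": "2015-06-01T00:00:00Z",
    "verified": False,
    "location": "Berlin, Germany"
}

print(f"{sample_profile['name']} (@{sample_profile['username']}) - {sample_profile['followers_count']:,} followers")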
Error Handling
Robust error handling is crucial for production scrapers:
import time
from requests.exceptions import RequestException

class TwitterScraper:
    def __init__(self, api_key, max_retries=3):
        self.api_key = api_key
        self.max_retries = max_retries
        # ... other init code

    def _make_request(self, endpoint, params=None):
        """Make API request with retry logic."""
        for attempt in range(self.max_retries):
            try:
                response = requests.get(
                    endpoint,
                    headers=self.headers,
                    params=params,
                    timeout=30
                )
                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 429:
                    # Rate limited - wait and retry
                    wait_time = int(
                        response.headers.get('Retry-After', 60)
                    )
                    print(f"Rate limited. Waiting {wait_time}s...")
                    time.sleep(wait_time)
                    continue
                elif response.status_code == 404:
                    raise ValueError("Resource not found")
                else:
                    response.raise_for_status()
            except RequestException as e:
                if attempt == self.max_retries - 1:
                    raise
                print(f"Request failed, retrying... ({e})")
                time.sleep(2 ** attempt)  # Exponential backoff

        raise Exception("Max retries exceeded")
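With this helper in place, the earlier request methods can delegate to it instead of calling requests.get directly. Here's a minimal sketch, assuming the same base_url and headers attributes from the first example:

# Inside the TwitterScraper class
def search_tweets(self, query, count=100):
    """Search for tweets, now routed through the retry helper."""
    data = self._make_request(
        f"{self.base_url}/search",
        params={"q": query, "count": count}
    )
    return data.get("tweets", [])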
Storing Data
Save your scraped data for analysis:
CSV Export
import pandas as pd

def save_tweets_csv(tweets, filename):
    """Save tweets to CSV file."""
    df = pd.DataFrame(tweets)
    df.to_csv(filename, index=False)
    print(f"Saved {len(tweets)} tweets to {filename}")

# Usage
tweets = scraper.search_tweets("machine learning")
save_tweets_csv(tweets, "ml_tweets.csv")
JSON Export
import json

def save_tweets_json(tweets, filename):
    """Save tweets to JSON file."""
    with open(filename, 'w') as f:
        json.dump(tweets, f, indent=2, default=str)
    print(f"Saved {len(tweets)} tweets to {filename}")

# Usage
save_tweets_json(tweets, "ml_tweets.json")
Database Storage
import sqlite3

def save_to_database(tweets, db_name="tweets.db"):
    """Save tweets to SQLite database."""
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    # Create table if not exists
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS tweets (
            id TEXT PRIMARY KEY,
            text TEXT,
            username TEXT,
            created_at TEXT,
            likes INTEGER,
            retweets INTEGER
        )
    ''')

    # Insert tweets
    for tweet in tweets:
        cursor.execute('''
            INSERT OR REPLACE INTO tweets
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (
            tweet['id'],
            tweet['text'],
            tweet['username'],
            tweet['created_at'],
            tweet['likes'],
            tweet['retweets']
        ))

    conn.commit()
    conn.close()
    print(f"Saved {len(tweets)} tweets to database")
Best Practices
1. Respect Rate Limits
Always implement delays between requests:
import time

def scrape_with_delay(queries, delay=2):
    """Scrape multiple queries with a delay between them."""
    results = []
    for query in queries:
        tweets = scraper.search_tweets(query)
        results.extend(tweets)
        time.sleep(delay)  # Wait between requests
    return results
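A fixed delay is predictable; if you want your request pattern to look less mechanical, you can add a small random jitter to each pause. A minimal sketch using only the standard library:

import random
import time

def polite_sleep(base_delay=2.0, jitter=1.0):
    """Sleep for base_delay plus a random extra, so requests aren't evenly spaced."""
    time.sleep(base_delay + random.uniform(0, jitter))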
2. Use Proper Headers
Set appropriate headers to identify your scraper:
headers = {
    "User-Agent": "MyTwitterScraper/1.0 (contact@example.com)",
    "Accept": "application/json"
}
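If you make many requests, it's convenient to attach these headers to a requests.Session once so every call reuses them (and the underlying connection). The sketch below uses httpbin.org, a public echo service, purely as a stand-in endpoint to show the headers being sent:

import requests

# Reuse the same headers (and connection) across many requests
session = requests.Session()
session.headers.update(headers)  # the headers dict defined above

# Every call made through this session now carries those headers
response = session.get("https://httpbin.org/headers", timeout=30)
print(response.json())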
3. Handle Pagination
Most APIs return paginated results:
# Add this method to the TwitterScraper class
def get_all_tweets(self, query, max_tweets=1000):
    """Get all tweets with pagination."""
    all_tweets = []
    cursor = None

    while len(all_tweets) < max_tweets:
        params = {"q": query, "count": 100}
        if cursor:
            params["cursor"] = cursor

        data = self._make_request(
            f"{self.base_url}/search",
            params
        )

        tweets = data.get("tweets", [])
        if not tweets:
            break

        all_tweets.extend(tweets)
        cursor = data.get("next_cursor")

        if not cursor:
            break

    return all_tweets[:max_tweets]
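Called on the scraper instance, this keeps requesting pages until it either runs out of results or hits the cap:

# Fetch up to 500 tweets across multiple pages
tweets = scraper.get_all_tweets("data science", max_tweets=500)
print(f"Collected {len(tweets)} tweets")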
4. Log Everything
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)
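Once configured, use the logger inside your scraping code instead of print, so every request and failure ends up in scraper.log as well as the console. For example:

logger.info("Searching for: %s", "python programming")
try:
    tweets = scraper.search_tweets("python programming")
    logger.info("Retrieved %d tweets", len(tweets))
except Exception as e:
    logger.error("Search failed: %s", e)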
5. Validate Data
def validate_tweet(tweet):
    """Ensure tweet has required fields."""
    required = ['id', 'text', 'username', 'created_at']
    return all(field in tweet for field in required)
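Run this check before storing anything, dropping records that are missing fields:

# Keep only tweets that contain every required field
clean_tweets = [t for t in tweets if validate_tweet(t)]
print(f"Kept {len(clean_tweets)} of {len(tweets)} tweets")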
Skip the Complexity
Building and maintaining a Twitter scraper is time-consuming. Our API handles rate limits, anti-bot measures, and data formatting for you.
Try X (Twitter) Scraper API
Complete Example
Here's a complete example that ties everything together:
"""
Twitter Scraper - Complete Example
"""
import requests
import pandas as pd
import time
import logging
from datetime import datetime
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class TwitterScraper:
def __init__(self, api_key):
self.api_key = api_key
self.base_url = "https://api.x-scraper.com/v1"
self.session = requests.Session()
self.session.headers.update({
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
})
def search(self, query, count=100):
"""Search tweets with error handling."""
try:
response = self.session.get(
f"{self.base_url}/search",
params={"q": query, "count": count},
timeout=30
)
response.raise_for_status()
return response.json().get("tweets", [])
except Exception as e:
logger.error(f"Search failed: {e}")
return []
def export_csv(self, tweets, filename):
"""Export tweets to CSV."""
df = pd.DataFrame(tweets)
df.to_csv(filename, index=False)
logger.info(f"Exported {len(tweets)} tweets to {filename}")
# Main execution
if __name__ == "__main__":
scraper = TwitterScraper("your-api-key")
# Search for tweets
tweets = scraper.search("artificial intelligence", count=200)
# Export results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
scraper.export_csv(tweets, f"ai_tweets_{timestamp}.csv")
print(f"Done! Found {len(tweets)} tweets.")
Conclusion
You now have the knowledge to build a Twitter scraper in Python. Key takeaways:
- Choose the right approach based on your needs (API vs scraping vs third-party)
- Implement robust error handling and retry logic
- Respect rate limits to avoid getting blocked
- Store data in appropriate formats for your use case
- Log everything for debugging and monitoring
For production use cases where reliability matters, consider using a managed solution like X (Twitter) Scraper API to save development time and ensure consistent results.