wpban/includes/class-wpban-ai-crawlers.php
2025-03-10 12:38:31 +08:00

123 lines
9.9 KiB
PHP

<?php
namespace WPBan_Anything;

// Stop immediately when ABSPATH is undefined, i.e. when this file is requested
// directly instead of being loaded by WordPress.
defined('ABSPATH') || exit;
/**
 * Publishes opt-out signals for known AI crawlers and counts their visits.
 *
 * On construction the class registers three WordPress hooks:
 *  - `robots_txt`: appends a `Disallow: /` rule for every enabled crawler;
 *  - `wp_head`:    prints a `noai, noimageai` robots meta tag;
 *  - `init`:       records requests whose User-Agent contains an enabled
 *                  crawler name in the `banned_stats` option.
 *
 * The enabled crawler names are persisted in the `ai_crawler_settings` option.
 */
class WPBan_AI_Crawlers {
    /**
     * Built-in crawler catalogue, keyed by the User-Agent token.
     *
     * @var array<string, array{description: string, link: string}>
     */
    private $ai_crawlers;

    /**
     * Builds the crawler catalogue and registers the WordPress hooks.
     *
     * NOTE(review): the catalogue descriptions are translated here via __(),
     * so the text domain must already be usable when the class is
     * instantiated - confirm the instantiation timing in the plugin bootstrap.
     */
    public function __construct() {
        $this->ai_crawlers = $this->get_default_crawlers();
        add_filter('robots_txt', [$this, 'add_ai_robots_txt'], 10, 2);
        add_action('wp_head', [$this, 'add_meta_tag']);
        add_action('init', [$this, 'monitor_crawlers']);
    }

    /**
     * Returns the built-in catalogue of AI crawlers.
     *
     * @return array<string, array{description: string, link: string}> Map of
     *         User-Agent token to a translated description and a reference URL.
     */
    private function get_default_crawlers() {
        return [
            'AI2Bot' => ['description' => __('Explores sites for web content that is used to train open language models', 'wpban-anything'), 'link' => 'https://allenai.org/crawler'],
            'Ai2Bot-Dolma' => ['description' => __('Generates data sets used to train open language models', 'wpban-anything'), 'link' => 'https://allenai.org/dolma'],
            'AmazonBot' => ['description' => __('Used by Amazon\'s Alexa AI to provide AI answers.', 'wpban-anything'), 'link' => 'https://developer.amazon.com/amazonbot'],
            'Applebot-Extended' => ['description' => __('Used by Apple for generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools.', 'wpban-anything'), 'link' => 'https://support.apple.com/en-us/119829'],
            'anthropic-ai' => ['description' => __('Used by Anthropic\'s Claude.', 'wpban-anything'), 'link' => 'https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler'],
            'Bytespider' => ['description' => __('Used by TikTok for AI training.', 'wpban-anything'), 'link' => 'https://darkvisitors.com/agents/bytespider'],
            'CCBot' => ['description' => __('Compiles datasets used to train AI models.', 'wpban-anything'), 'link' => 'https://commoncrawl.org/big-picture/frequently-asked-questions/'],
            'ChatGPT-User' => ['description' => __('Used by OpenAI to power ChatGPT.', 'wpban-anything'), 'link' => 'https://platform.openai.com/docs/plugins/bot'],
            'ClaudeBot' => ['description' => __('Used by Anthropic\'s Claude.', 'wpban-anything'), 'link' => 'https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler'],
            'Claude-Web' => ['description' => __('Used by Anthropic\'s Claude.', 'wpban-anything'), 'link' => 'https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler'],
            'cohere-ai' => ['description' => __('Used by Cohere to scrape data for AI training.', 'wpban-anything'), 'link' => 'https://cohere.com/about'],
            'cohere-training-data-crawler' => ['description' => __('Used by Cohere to scrape data for AI training.', 'wpban-anything'), 'link' => 'https://cohere.com/about'],
            'Crawlspace' => ['description' => __('A web scraper that can be used to extract data for AI training.', 'wpban-anything'), 'link' => 'https://crawlspace.dev/'],
            'Diffbot' => ['description' => __('Used by Diffbot to scrape data for AI training.', 'wpban-anything'), 'link' => 'https://docs.diffbot.com/reference/crawl-introduction'],
            'FacebookBot' => ['description' => __('Used by Meta (Facebook) for their AI.', 'wpban-anything'), 'link' => 'https://developers.facebook.com/docs/sharing/bot'],
            'FriendlyCrawler' => ['description' => __('Crawls websites to build datasets for machine learning experiments.', 'wpban-anything'), 'link' => 'https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler/'],
            'GPTBot' => ['description' => __('Used by OpenAI to power ChatGPT.', 'wpban-anything'), 'link' => 'https://platform.openai.com/docs/bots'],
            'Google-Extended' => ['description' => __('Used by Google to power Gemini (formerly known as Bard).', 'wpban-anything'), 'link' => 'https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers'],
            'ImagesiftBot' => ['description' => __('Used by Hive\'s Imagesift tool that scrapes images. This may be used for the company\'s generative AI product.', 'wpban-anything'), 'link' => 'https://imagesift.com/about'],
            'Kangaroo Bot' => ['description' => __('Used to power the Australia-focused Kangaroo LLM.', 'wpban-anything'), 'link' => 'https://kangaroollm.com.au/kangaroo-bot/'],
            'Meta-ExternalAgent' => ['description' => __('Used by Meta (Facebook) to train AI products.', 'wpban-anything'), 'link' => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers'],
            'Meta-ExternalFetcher' => ['description' => __('Used by Meta (Facebook) to train AI products.', 'wpban-anything'), 'link' => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers'],
            'OAI-SearchBot' => ['description' => __('Used by OpenAI for their SearchGPT product.', 'wpban-anything'), 'link' => 'https://platform.openai.com/docs/bots'],
            'Omgili' => ['description' => __('Used by Omgili to scrape data for AI training.', 'wpban-anything'), 'link' => 'https://webz.io/blog/machine-learning/common-crawl-vs-webz-io-data'],
            'Omgilibot' => ['description' => __('Used by Omgili to scrape data for AI training.', 'wpban-anything'), 'link' => 'https://webz.io/blog/machine-learning/common-crawl-vs-webz-io-data'],
            'PanguBot' => ['description' => __('Used by Huawei to download data for the Large Language Model (LLM) called PanGu.', 'wpban-anything'), 'link' => 'https://darkvisitors.com/agents/pangubot'],
            'PetalBot' => ['description' => __('Used by Huawei to scrape data for AI training.', 'wpban-anything'), 'link' => 'https://darkvisitors.com/agents/petalbot'],
            'PerplexityBot' => ['description' => __('Used by Perplexity for their AI products.', 'wpban-anything'), 'link' => 'https://docs.perplexity.ai/docs/perplexitybot'],
            'Scrapy' => ['description' => __('Blocks the Scrapy bot (used for scraping websites).', 'wpban-anything'), 'link' => 'https://scrapy.org/'],
            'SemrushBot' => ['description' => __('Blocks the Semrush bot used to pull data into the Semrush platform. Data is used for their ContentShake AI tool.', 'wpban-anything'), 'link' => 'https://www.semrush.com/bot/'],
            'SemrushBot-OCOB' => ['description' => __('Blocks the Semrush bot used to pull data into the Semrush platform.', 'wpban-anything'), 'link' => 'https://www.semrush.com/bot/'],
            'SemrushBot-FT' => ['description' => __('Blocks the Semrush bot used to pull data into the Semrush platform.', 'wpban-anything'), 'link' => 'https://www.semrush.com/bot/'],
            'SentiBot' => ['description' => __('Used for sentiment analysis and AI training.', 'wpban-anything'), 'link' => 'https://darkvisitors.com/agents/sentibot'],
            'sentibot' => ['description' => __('Used for sentiment analysis and AI training.', 'wpban-anything'), 'link' => 'https://darkvisitors.com/agents/sentibot'],
            'Timpibot' => ['description' => __('Used by Timpi; likely for their Wilson AI Product.', 'wpban-anything'), 'link' => 'https://timpi.io/wilson-ai/'],
            'TurnitinBot' => ['description' => __('Used by Turnitin to scrape data for plagiarism detection.', 'wpban-anything'), 'link' => 'https://www.turnitin.com/robot/crawlerinfo.html'],
            'YouBot' => ['description' => __('Used by You.com to train AI products.', 'wpban-anything'), 'link' => 'https://about.you.com/es/youbot/'],
            'webzio' => ['description' => __('Used by Webz.io for their social listening and intelligence platforms.', 'wpban-anything'), 'link' => 'https://webz.io/bot.html'],
            'webzio-extended' => ['description' => __('Used by Webz.io for AI training.', 'wpban-anything'), 'link' => 'https://webz.io/bot.html']
        ];
    }

    /**
     * Reduces a list of names to the distinct entries present in the catalogue.
     *
     * Non-string entries, empty strings and unknown names are dropped, so the
     * result is always safe to pass to stripos() or to print into robots.txt.
     *
     * @param array $names Candidate crawler names (any value types).
     * @return string[] Known crawler names, in first-seen order, without duplicates.
     */
    private function filter_known_crawlers(array $names) {
        $known = [];
        foreach ($names as $name) {
            if (is_string($name) && isset($this->ai_crawlers[$name])) {
                $known[$name] = true;
            }
        }
        return array_keys($known);
    }

    /**
     * Returns the validated list of crawlers that are currently enabled.
     *
     * @return string[] Enabled crawler names that exist in the catalogue.
     */
    private function get_enabled_crawlers() {
        $settings = $this->get_settings();
        return $this->filter_known_crawlers($settings['enabled_crawlers']);
    }

    /**
     * Reads the `ai_crawler_settings` option.
     *
     * When the option does not exist (or is not an array) every catalogue
     * entry is reported as enabled. The returned array always carries an
     * `enabled_crawlers` key holding an array, so callers can iterate it
     * without further checks.
     *
     * @return array Settings array with an `enabled_crawlers` list.
     */
    public function get_settings() {
        $defaults = [
            'enabled_crawlers' => array_keys($this->ai_crawlers)
        ];
        $settings = get_option('ai_crawler_settings', $defaults);
        if (!is_array($settings)) {
            // A stored value of the wrong type is treated like a missing option.
            return $defaults;
        }
        $enabled = $settings['enabled_crawlers'] ?? $defaults['enabled_crawlers'];
        $settings['enabled_crawlers'] = is_array($enabled) ? $enabled : [];
        return $settings;
    }

    /**
     * Persists the crawler selection submitted by the caller.
     *
     * Only names that exist in the catalogue are stored; anything else
     * (empty strings, nested arrays, unknown names, duplicates) is discarded.
     * A scalar `ai_crawlers` value is accepted and treated as a one-item list.
     *
     * @param array $data Submitted data; the selection is read from `ai_crawlers`.
     * @return array The settings array that was written to `ai_crawler_settings`.
     */
    public function save_settings($data) {
        $submitted = isset($data['ai_crawlers']) ? (array) $data['ai_crawlers'] : [];
        $sanitized = [];
        foreach ($submitted as $name) {
            // sanitize_text_field() expects a string; skip arrays/objects.
            if (is_scalar($name)) {
                $sanitized[] = sanitize_text_field((string) $name);
            }
        }
        $settings = [
            'enabled_crawlers' => $this->filter_known_crawlers($sanitized)
        ];
        update_option('ai_crawler_settings', $settings);
        return $settings;
    }

    /**
     * `robots_txt` filter callback: appends a block rule per enabled crawler.
     *
     * @param string $robots Current robots.txt content.
     * @param mixed  $public Second argument supplied by the filter; unused here.
     * @return string robots.txt content with the crawler section appended.
     */
    public function add_ai_robots_txt($robots, $public) {
        $robots .= "\n# WPBan-Anything AI Crawler Blocks\n";
        foreach ($this->get_enabled_crawlers() as $crawler) {
            $robots .= "User-agent: $crawler\nDisallow: /\n";
        }
        $robots .= "# End WPBan-Anything AI Crawler Blocks\n";
        return $robots;
    }

    /**
     * `wp_head` action callback: prints the `noai, noimageai` robots meta tag.
     */
    public function add_meta_tag() {
        echo '<meta name="robots" content="noai, noimageai" />';
    }

    /**
     * Returns the full crawler catalogue (enabled or not).
     *
     * @return array<string, array{description: string, link: string}>
     */
    public function get_crawler_list() {
        return $this->ai_crawlers;
    }

    /**
     * `init` action callback: counts the request when its User-Agent contains
     * the name of an enabled crawler (case-insensitive substring match).
     *
     * At most one hit is recorded per request. The counters live in the
     * `banned_stats` option; missing keys in a previously stored value are
     * filled in before incrementing.
     */
    public function monitor_crawlers() {
        $ua = $_SERVER['HTTP_USER_AGENT'] ?? '';
        if (!is_string($ua) || $ua === '') {
            // Nothing to match against; avoid loading the settings at all.
            return;
        }

        $matched = false;
        // Names come from the validated list, so they are never empty: an
        // empty needle would make stripos() match every User-Agent on PHP 8.
        foreach ($this->get_enabled_crawlers() as $crawler) {
            if (stripos($ua, $crawler) !== false) {
                $matched = true;
                break;
            }
        }
        if (!$matched) {
            return;
        }

        $defaults = [
            'users' => [],
            'total_count' => 0,
            'last_ban_time' => null,
            'types' => ['ip_ban' => 0, 'login_restriction' => 0, 'wechat_qq_block' => 0, 'ai_crawler_block' => 0, 'seo_crawler_block' => 0]
        ];
        $stats = get_option('banned_stats', $defaults);
        // Tolerate a stored value that has a different shape than expected.
        $stats = is_array($stats) ? array_merge($defaults, $stats) : $defaults;
        if (!is_array($stats['users'])) {
            $stats['users'] = [];
        }
        if (!is_array($stats['types'])) {
            $stats['types'] = [];
        }

        $ip = ban_anything_get_ip();
        $now = current_time('mysql');
        $stats['users'][$ip] = [
            'count' => (int) ($stats['users'][$ip]['count'] ?? 0) + 1,
            'last_time' => $now,
            'type' => 'ai_crawler_block'
        ];
        $stats['total_count'] = (int) $stats['total_count'] + 1;
        $stats['last_ban_time'] = $now;
        $stats['types']['ai_crawler_block'] = (int) ($stats['types']['ai_crawler_block'] ?? 0) + 1;
        update_option('banned_stats', $stats);
    }
}