В robots.txt можно перечислить нежелательные юзер-агенты, например:
# One group: every robot listed below is asked not to crawl any URL.
# Each robot needs its own User-agent line; a comma-separated list on a
# single line is read as one literal name and matches no robot.
User-agent: DISCo Pump
User-agent: Wget
User-agent: WebZIP
User-agent: Teleport Pro
User-agent: WebSnake
User-agent: Offline Explorer
User-agent: Web-By-Mail
User-agent: Teleport Pro/1.29
User-agent: Scooter-W3-1.0
Disallow: /
Этот метод не закрывает им доступ, а лишь регламентирует его: robots.txt носит рекомендательный характер, и недобросовестные роботы могут его игнорировать.
Чтобы полностью закрыть доступ, например под Apache, можно использовать .htaccess:
<IfModule mod_rewrite.c>
RewriteEngine On
RewriteBase /

# --- 1. Refuse unwanted clients with 403 Forbidden -------------------------
# This rule is placed before the front-controller rule so that blocked
# clients are rejected without depending on a second rewrite pass.
#
# Condition logic: mod_rewrite evaluates the list below as
#   (URI is not /403.php) AND (UA matches pattern 1 OR pattern 2 OR ...),
# because [OR] binds tighter than the implicit AND between conditions.
# /403.php is excluded so it can still be served to a blocked client
# (presumably it is the custom 403 error page -- confirm ErrorDocument setup).
#
# Patterns are case-sensitive, unanchored regular expressions, so each one
# matches anywhere inside the User-Agent header unless it ends with "$".
RewriteCond %{REQUEST_URI} !^/403\.php$
RewriteCond %{HTTP_USER_AGENT} Ask\sJeeves [OR]
RewriteCond %{HTTP_USER_AGENT} HP\sWeb\sPrintSmart [OR]
RewriteCond %{HTTP_USER_AGENT} HTTrack [OR]
RewriteCond %{HTTP_USER_AGENT} IDBot [OR]
RewriteCond %{HTTP_USER_AGENT} Indy\sLibrary [OR]
RewriteCond %{HTTP_USER_AGENT} ListChecker [OR]
RewriteCond %{HTTP_USER_AGENT} MSIECrawler [OR]
RewriteCond %{HTTP_USER_AGENT} NetCache [OR]
RewriteCond %{HTTP_USER_AGENT} Nutch [OR]
RewriteCond %{HTTP_USER_AGENT} rulinki\.ru [OR]
RewriteCond %{HTTP_USER_AGENT} Twiceler [OR]
RewriteCond %{HTTP_USER_AGENT} Webster\sPro [OR]
RewriteCond %{HTTP_USER_AGENT} www\.cys\.ru [OR]
RewriteCond %{HTTP_USER_AGENT} Wysigot [OR]
RewriteCond %{HTTP_USER_AGENT} Yeti [OR]
RewriteCond %{HTTP_USER_AGENT} Accoona [OR]
RewriteCond %{HTTP_USER_AGENT} CazoodleBot [OR]
RewriteCond %{HTTP_USER_AGENT} CFNetwork [OR]
RewriteCond %{HTTP_USER_AGENT} ConveraCrawler [OR]
RewriteCond %{HTTP_USER_AGENT} DISCo [OR]
RewriteCond %{HTTP_USER_AGENT} Download\sMaster [OR]
RewriteCond %{HTTP_USER_AGENT} FAST\sMetaWeb\sCrawler [OR]
RewriteCond %{HTTP_USER_AGENT} Flexum\sspider$ [OR]
RewriteCond %{HTTP_USER_AGENT} HTMLParser [OR]
RewriteCond %{HTTP_USER_AGENT} ia_archiver [OR]
RewriteCond %{HTTP_USER_AGENT} ichiro [OR]
RewriteCond %{HTTP_USER_AGENT} IRLbot [OR]
RewriteCond %{HTTP_USER_AGENT} Java [OR]
RewriteCond %{HTTP_USER_AGENT} km\.ru\sbot$ [OR]
RewriteCond %{HTTP_USER_AGENT} kmSearchBot [OR]
RewriteCond %{HTTP_USER_AGENT} libwww-perl [OR]
RewriteCond %{HTTP_USER_AGENT} Lupa\.ru [OR]
RewriteCond %{HTTP_USER_AGENT} LWP::Simple [OR]
RewriteCond %{HTTP_USER_AGENT} lwp-trivial [OR]
RewriteCond %{HTTP_USER_AGENT} Missigua [OR]
RewriteCond %{HTTP_USER_AGENT} MJ12bot [OR]
RewriteCond %{HTTP_USER_AGENT} Offline\sExplorer [OR]
RewriteCond %{HTTP_USER_AGENT} OmniExplorer_Bot [OR]
RewriteCond %{HTTP_USER_AGENT} PEAR [OR]
RewriteCond %{HTTP_USER_AGENT} psbot [OR]
RewriteCond %{HTTP_USER_AGENT} Python [OR]
RewriteCond %{HTTP_USER_AGENT} SMILE [OR]
RewriteCond %{HTTP_USER_AGENT} Speedy [OR]
RewriteCond %{HTTP_USER_AGENT} Teleport\sPro [OR]
RewriteCond %{HTTP_USER_AGENT} TurtleScanner [OR]
RewriteCond %{HTTP_USER_AGENT} voyager [OR]
RewriteCond %{HTTP_USER_AGENT} WebCopier [OR]
RewriteCond %{HTTP_USER_AGENT} WebData [OR]
RewriteCond %{HTTP_USER_AGENT} WebZIP [OR]
RewriteCond %{HTTP_USER_AGENT} Wget [OR]
RewriteCond %{HTTP_USER_AGENT} Yanga [OR]
RewriteCond %{HTTP_USER_AGENT} igdeSpyder [OR]
RewriteCond %{HTTP_USER_AGENT} DotBot [OR]
RewriteCond %{HTTP_USER_AGENT} larbin [OR]
RewriteCond %{HTTP_USER_AGENT} Exabot [OR]
RewriteCond %{HTTP_USER_AGENT} ovalebot [OR]
RewriteCond %{HTTP_USER_AGENT} OOZBOT [OR]
RewriteCond %{HTTP_USER_AGENT} Baiduspider [OR]
RewriteCond %{HTTP_USER_AGENT} AportWorm [OR]
RewriteCond %{HTTP_USER_AGENT} Dolphin/ [OR]
RewriteCond %{HTTP_USER_AGENT} www\.archive\.org [OR]
RewriteCond %{HTTP_USER_AGENT} ZangoToolbar [OR]
RewriteCond %{HTTP_USER_AGENT} www\.sogou\.com [OR]
RewriteCond %{HTTP_USER_AGENT} pango-text [OR]
RewriteCond %{HTTP_USER_AGENT} Snoopy [OR]
RewriteCond %{HTTP_USER_AGENT} panscient\.com [OR]
RewriteCond %{HTTP_USER_AGENT} www\.trueoffice\.ru [OR]
RewriteCond %{HTTP_USER_AGENT} Tagoobot [OR]
RewriteCond %{HTTP_USER_AGENT} SiteBot [OR]
RewriteCond %{HTTP_USER_AGENT} TurnitinBot [OR]
RewriteCond %{HTTP_USER_AGENT} HTTPClient [OR]
RewriteCond %{HTTP_USER_AGENT} ezooms\.bot [OR]
RewriteCond %{HTTP_USER_AGENT} SolomonoBot [OR]
RewriteCond %{HTTP_USER_AGENT} YodaoBot [OR]
RewriteCond %{HTTP_USER_AGENT} discobot [OR]
RewriteCond %{HTTP_USER_AGENT} AhrefsBot [OR]
RewriteCond %{HTTP_USER_AGENT} magpie-crawler [OR]
RewriteCond %{HTTP_USER_AGENT} sistrix\.net [OR]
RewriteCond %{HTTP_USER_AGENT} KomodiaBot [OR]
RewriteCond %{HTTP_USER_AGENT} AcoonBot [OR]
# An empty User-Agent header is rejected as well. This is the last
# alternative in the OR chain, so it carries no [OR] flag.
RewriteCond %{HTTP_USER_AGENT} ^$
# "-" means "no substitution": [F] answers 403 and ignores any target.
RewriteRule .* - [F]

# --- 2. Front controller ----------------------------------------------------
# Any request that is neither an existing file nor an existing directory is
# handed to index.php, with the requested path appended after the script name.
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule ^(.*)$ /index.php/$1 [L]
</IfModule>
Регулярно проверяя логи или сводку статистики сайта, пополняем список нежелательных «индексаторов»...