# robots.txt for www.smythies.com
#
# A note to wordpress users that fetch this file as
# an example via the "Multipart robots.txt editor"
# plugin.
# I deny a lot of bots; you might want to reconsider
# for your application.
# Also, this file is only one step in my access management.
# I also use:
# Apache rewrite rules, i.e. for bots that ignore this file.
# Direct iptables DROP rules for annoying crawlers without an
# otherwise uniquely identifiable user agent string.
# (Illustrative sketches of both appear further down in these comments.)
#
#
# robots.txt 2024.04.06
# disallow Friendly_Crawler
# robots.txt 2024.04.03
# disallow FriendlyCrawler
# robots.txt 2023.01.24
# disallow woorankreview
# disallow MaCoCu
# disallow AwarioBot
# disallow okhttp
# disallow Bytespider
# disallow SenutoBot
# disallow OWLer
# disallow FunnelBack
# disallow TinyTestBot
# disallow Cincraw
# disallow HubSpot
# disallow fidget-spinner-bot
# disallow ClaudeBot
#
# robots.txt 2022.04.21
# disallow Keybot Translation-Search-Machine
# disallow INETDEX-BOT
# disallow YaK
# disallow MixrankBot
# disallow SERPtimizerBot
# disallow vuhuvBot
# disallow Amazonbot
# disallow IonCrawl
# disallow SeekportBot
# disallow Jambot
# disallow Diffbot
# disallow SeekBot
# disallow Twitterbot
# disallow webprosbot
#
# robots.txt 2022.02.27
# disallow .mp4
#
# robots.txt 2021.09.29
# disallow: User-agent: clark-crawler
# requires its own directive, or doesn't work:
# disallow: User-agent: DataForSeoBot
# disallow: SurdotlyBot
# disallow: DomainStatsBot
# disallow: FlfBaldrBot
#
# robots.txt 2021.06.15
# disallow: User-agent: InfoTigerBot
# Is this just a new name for the 2021.05.17 entry?
#
# robots.txt 2021.05.17
# disallow: User-agent: infotiger
# They do not actually set their user agent, but
# they do read robots.txt, although not often.
#
# robots.txt 2021.05.14
# I suspect bingbot is ignoring its disallow
# and using the later old MSNbot stuff. Try
# setting MSNbot to disallow.
#
# robots.txt 2021.05.06
# disallow: User-agent: Neevabot
# disallow: User-agent: Linguee
#
# robots.txt 2021.03.26
# disallow: User-agent: SemanticScholarBot
#
# robots.txt 2020.12.28
# disallow: User-agent: niocBot
# disallow: User-agent: PetalBot
#
# robots.txt 2020.07.23
# disallow: User-agent: oBot
#
# robots.txt 2020.07.19
# restrict some additional sub-directories
# where I file junk for others and myself,
# under ~doug/linux/s18/hwp
#
# disallow: User-agent: JobboerseBot
# disallow: User-agent: CheckMarkNetwork
#
# robots.txt 2018.10.30
# disallow: User-agent: bbot
# disallow: User-agent: brands-bot-logo
#
# robots.txt 2018.10.30
# disallow: User-agent: oBot
#
# robots.txt 2018.10.30
# disallow: User-agent: serpstatbot
# disallow: User-agent: Datanyze
# disallow: User-agent: IndeedBot
# disallow: User-agent: Experibot
# disallow: User-agent: Seekport
# disallow: User-agent: Clarabot
# not sure if Datanyze checks robots.txt
#
# robots.txt 2018.08.06
# add .svg files to disallow lists.
#
# robots.txt 2018.05.09
# disallow: User-agent: dataprovider
# disallow: User-agent: crawler4j
# disallow: User-agent: ExtLinksBot
# disallow: User-agent: The Knowledge AI
# Add more versions of Sogou crawlers
#
# robots.txt 2018.04.11
# I often post .csv files and .data files for my
# experiments. Add to disallow list.
#
# robots.txt 2018.04.10
# disallow: User-agent: MauiBot
# disallow: User-agent: DAUM
#
# robots.txt 2017.12.02
# Change to multiple User-agent: lines per disallow, as
# specified in the original robots.txt specifications.
# disallow: User-agent: SEOkicks-Robot
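#
# Illustrative sketches for the two other layers mentioned in the
# header above. These are hedged examples with made-up names and
# placeholder addresses, not the exact rules used on this server.
#
# An Apache rewrite rule, for a bot that ignores this file, keyed on
# its user agent string (assumes mod_rewrite is enabled):
#
#   RewriteEngine On
#   RewriteCond %{HTTP_USER_AGENT} "SomeIgnoredBot" [NC]
#   RewriteRule ^ - [F]
#
# A direct iptables DROP rule for a crawler that sends no usable user
# agent string, keyed on its source address range (192.0.2.0/24 is a
# documentation range, used here only as a placeholder):
#
#   iptables -A INPUT -s 192.0.2.0/24 -j DROP
#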
#
# robots.txt 2017.07.01
# disallow: User-agent: SiteExplorer Findxbot GarlikCrawler ZoominfoBot BUbiNG
# Barkrowler rogerbot dotbot JamesBOT Contacts-Crawler CCBot IDBot DnyzBot
# PiplBot AlphaBot AlphaSeoBot AlphaSeoBot-SA
#
# robots.txt 2017.04.20
# disallow: User-agent: Qwantify
#
# robots.txt 2017.04.15
# 007ac9.net crawlers are seriously annoying.
# While they do attempt to fetch the robots.txt file,
# they do not use any user agent string when they do so,
# and therefore they hit my "no user agent" rewrite rule.
# They use a user agent string for everything else, but
# do not identify themselves as a 007ac9 crawler, making
# a user agent based rewrite rule impossible.
#
# disallow: User-agent: coccocbot-web
#
# robots.txt 2017.03.18
# disallow: User-agent: 007ac9
# although their bots do not identify themselves,
# they claim to honor robots.txt. We'll see.
#
# robots.txt 2017.02.17
# disallow: User-agent: yoozBot
# googlebot does ignore crawl delay. Take it out.
#
# robots.txt 2017.01.20
# googlebot crawls too fast.
# Try a crawl delay directive for googlebot,
# although I seem to recall it ignores it.
#
# robots.txt 2017.01.09
# disallow: User-agent: DomainCrawler
#
# robots.txt 2016.12.28
# disallow: User-agent: Cliqzbot
#
# robots.txt 2016.12.08
# disallow: User-agent: Seeker
#
# robots.txt 2016.09.04
# disallow: User-agent: Uptimebot
#
# robots.txt 2016.08.04
# disallow: User-agent: Sogou web spider
#
# robots.txt 2016.07.08
# I have been watching Qwantify.
# It gets the same stuff over and over again,
# rather often. Try a crawl delay. If that
# doesn't help, I'll just disallow it.
#
# robots.txt 2016.04.03
# disallow: User-agent: RankActiveLinkBot
#
# robots.txt 2016.02.23
# Aboundex has already been added, it turns out twice.
# Try Aboundex only, in addition to the already existing
# rule for Aboundexbot.
# Will also be adding a re-write rule.
#
# robots.txt 2016.02.09
# disallow: User-agent: plukkie
# disallow: User-agent: Applebot
# disallow: User-agent: Lipperhey
# disallow: User-agent: SafeDNSBot
#
# robots.txt 2016.01.09
# Try, does this work?
# disallow: User-agent: gocrawl
#
# robots.txt 2015.10.25
# disallow: User-agent: NextGenSearchBot
#
# robots.txt 2015.10.17
# disallow: User-agent: parsijoo-bot
#
# robots.txt 2015.10.09
# disallow: User-agent: betaBot
#
# robots.txt 2015.09.13
# disallow: User-agent: RankSonicBot
#
# robots.txt 2015.09.08
# try this instead:
# disallow: User-agent: yacybot
#
# robots.txt 2015.09.08
# disallow: User-agent: YaCy
# I do not know if it should be
# YaCy or yacybot.
#
# robots.txt 2015.08.25
# disallow: User-agent: thunderstone
#
# robots.txt 2015.08.19
# The Nutch disallow is not working.
# The Apache web site says it should work.
# Try "tbot-nutch".
# Oh my god, these bots are annoying.
#
# robots.txt 2015.07.20
# Google Search Appliance
# disallow: User-agent: gsa-crawler
# Apache Nutch-based bots.
# disallow: User-agent: Nutch
#
# robots.txt 2015.07.10
# disallow: User-agent: LSSRocketCrawler
#
# robots.txt 2015.06.20
# disallow: User-agent: YisouSpider
#
# robots.txt 2015.05.29
# Yet another demented bot.
# disallow: User-agent: SMTBot
#
# robots.txt 2015.01.25
# Add some directives for slurp (Yahoo)
# See also 2009.09.09
#
# robots.txt 2015.01.22
# disallow: User-agent: ltx71
# disallow: User-agent: AdvBot
#
# robots.txt 2015.01.10
# Make sure all bots know to avoid bot_trap.html
#
# robots.txt 2014.12.31
# disallow: User-agent: memoryBot
# Another day, another challenged bot.
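#
# Two asides on techniques mentioned in the entries above; both are
# illustrative sketches, not necessarily the exact rules used here.
#
# The "no user agent" rewrite rule (2017.04.15 entry): the usual
# mod_rewrite pattern for refusing requests that send an empty
# User-Agent header is:
#
#   RewriteCond %{HTTP_USER_AGENT} ^$
#   RewriteRule ^ - [F]
#
# The crawl delay directive (2017.01.20 and 2016.07.08 entries):
# Crawl-delay is non-standard, conventionally interpreted as seconds
# between fetches, and ignored by googlebot (see 2017.02.17). With a
# made-up bot name, the form is:
#
#   User-agent: ExampleBot
#   Crawl-delay: 10
#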
#
# robots.txt 2014.11.22
# disallow: User-agent: MojeekBot
# It doesn't need to check this file for every access.
#
# robots.txt 2014.11.13
# disallow .mp3
#
# robots.txt 2014.10.08
# There seem to be a lot more bots lately.
# disallow: User-agent: LoadTimeBot
# disallow: User-agent: oBot
#
# robots.txt 2014.10.02
# disallow: User-agent: Riddler
# disallow: User-agent: A6-Indexer
# Seems to check robots.txt often.
# disallow: User-agent: SemrushBot
# Although, it doesn't seem to check this file anyhow.
#
# robots.txt 2014.09.29
# bingbot has had a mental breakdown
# and I'm fed up with it.
# disallow: User-agent: bingbot
#
# robots.txt 2014.09.03
# disallow: User-agent: XoviBot
# Although, it doesn't seem to check this file anyhow.
#
# robots.txt 2014.08.28
# bingbot needs to slow down. Use crawl_delay
#
# robots.txt 2014.05.15
# disallow: User-agent: Aboundexbot
# No: already done. See 2013.09.02
#
# robots.txt 2014.05.15
# disallow: User-agent: BLEXBot
#
# robots.txt 2014.04.18
# disallow: User-agent: wotbox
#
# robots.txt 2015.01.22
# disallow: User-agent: EasouSpider
# disallow: User-agent: SeznamBot
#
# robots.txt 2014.02.16
# disallow: User-agent: LinkpadBot
#
# robots.txt 2014.02.15
# disallow: User-agent: archive.org_bot
# (The WayBack machine).
#
# robots.txt 2013.11.18
# disallow: User-agent: spbot
#
# robots.txt 2013.10.07
# disallow: User-agent: Mail.Ru
# disallow: User-agent: meanpathbot
#
# robots.txt 2013.09.02
# Another day, another mentally challenged crawler.
# disallow: User-agent: Aboundexbot
#
# robots.txt 2013.05.15
# Another day, another mentally challenged crawler.
# disallow: User-agent: netEstate NE Crawler
#
# robots.txt 2013.04.25
# disallow ip-web-crawler.com. It crawls way too fast and, while
# it claims to obey robots.txt directives, it does not.
# If it doesn't obey the disallow, then an iptables drop for
# 50.31.96.6 - 50.31.96.12 could be used.
#
# robots.txt 2013.04.17
# add some disallow stuff for specific file extensions.
# Somehow I missed it before.
#
# robots.txt 2013.04.04
# disallow Sosospider. Any web crawler that is too stupid to know the
# difference between upper and lower case is not worthy.
#
# robots.txt 2013.02.28
# disallow Exabot. I wonder if the resulting search engine
# database is the reason I get so many forged referrer
# hits.
#
# robots.txt 2012.10.08
# disallow WBSearchBot.
#
# robots.txt 2012.09.02
# disallow SearchmetricsBot. It is mentally challenged.
#
# robots.txt 2012.05.03
# disallow TurnitinBot. It is mentally challenged.
#
# robots.txt 2012.03.29
# disallow EC2LinkFinder. I do not know if it obeys robots.txt, but I will try.
# For sure it ignores most robots.txt directives. It copies everything, hogging
# bandwidth.
# It is time to think of a generic deny, to cover all these new bots.
#
# robots.txt 2012.03.13
# disallow SWEBot. It is not polite and disobeys the robots.txt file.
#
# robots.txt 2012.01.29
# disallow aiHitBot
# Try a user agent "InfoPath" and "InfoPath.2" disallow. (Another MS thing.)
# I am trying to get rid of what appears to be a tracking site.
# 80.40.134.103, .104, .120, seem to track 92.9.131.199 and 92.9.150.29 and ...
# 80.40.134.XXX does read the robots.txt file.
#
# robots.txt 2012.01.04
# SISTRIX crawler does not behave well. It ignores meta tags and some robots.txt directives.
# Disallow it.
#
# robots.txt 2011.12.01
# Try to get rid of the Ezooms bot, although it is not clear what its exact user agent name is.
# (Days later: "User-agent: Ezooms" seems to work, but it takes a few days.)
# It ignores meta tags, and has become generally annoying.
#
# robots.txt 2011.09.26
# Until now I have allowed Baiduspider. But it has gone mental and also ignores some meta tags.
# Disallow it.
# A new robot, AhrefsBot, does not behave or obey meta tags.
# Disallow it.
#
# robots.txt 2011.06.19
#
# robots.txt 2011.04.12
# Googlebot is so very very severely mentally challenged.
# It ignores the NOFOLLOW meta tag.
# Try to block useless content from being indexed via, yet another,
# block command.
#
# It is still looking for pages that haven't been there for over a year now.
# (see 2010.04.29)
#
# robots.txt 2010.10.14
# Eliminate crawl delay for Yahoo slurp (see 2007.03.13)
#
# robots.txt 2010.09.20
# TwengaBot is severely mentally challenged. Try a global disallow for it.
# Googlebot is still annoying and accessing pages it shouldn't.
#
# robots.txt 2010.04.29
# Googlebot is very severely mentally challenged.
# Add disallow directives for directories that are not even there,
# and haven't been for over 5 weeks now.
# This is merely to try to get around having my request to delete the
# non-existent directories from the search database being denied.
#
# robots.txt 2010.04.16
# Add specific directives for exabot, including a crawl delay.
# Reduce the slurp (Yahoo) crawl delay (which it doesn't seem to obey anyhow).
# Disallow googlebot-image.
#
# robots.txt 2010.04.13
# disallow taptubot, the mobile device crawler
#
# robots.txt 2010.04.01
# Yet another attempt to get web crawlers not to index old versions of index.html files.
# All old versions are called index_0???.html.
#
# robots.txt 2010.03.19
# Archives have been moved to a separate directory. Add a disallow directive.
#
# robots.txt 2010.02.10
# The Yandex web crawler behaves in a very strange manner. Block it.
# Ask robots not to copy PDF files.
#
# robots.txt 2009.12.07
# Fix some syntax based on feedback from http://tool.motoricerca.info/robots-checker.phtml
#
# robots.txt 2009.12.04
# There are still issues with googlebot. I don't want old versions of index.html
# type pages indexed, but I do want the Photoshop Elements generated pages indexed.
# Try some new directives.
#
# robots.txt 2009.09.09
# Googlebot is not ignoring the rebuilt directory and is obtaining .MOV videos.
# Add some more googlebot specific directives.
#
# robots.txt 2009.07.27
# Googlebot directives are case sensitive. Add .JPG to the .jpg ignore directives.
# Googlebot is not ignoring old index pages as the global directive indicates it should. Try a googlebot
# specific directive.
#
# robots.txt 2009.04.12
# Some robots, for example googlebot, obey global directives as well as googlebot specific directives.
# Other robots, for example slurp (Yahoo) and msnbot, only obey their specific directives.
# The robots.txt standard is rather weak, incomplete, and generally annoying.
# Add tons of the same specific directives to each robot area.
# Try changing the "no index Christmas pages" directive to include a wildcard.
#
# robots.txt 2008.12.03
# Block the Cuil (twiceler) robot entirely.
#
# robots.txt 2008.11.23
# The majestic robot comes in bursts at a high rate. Just block it.
# The Cuil robot comes too often. Try to slow it down.
#
# robots.txt 2008.07.03
# Now msnbot has started to grab images. Try to stop it.
# Googlebot is grabbing PNG files. Try to stop it.
#
# robots.txt 2007.11.20
# Try to disallow the panscient.com web crawler.
#
# robots.txt 2007.08.23
# Still, search engine pages do not agree with the contents of the robots.txt file.
# Add a specific disallow for ~doug/rebuilt.
# - put global user agent lines after specific ones.
# - next will be to repeat global lines in each specific agent area.
#
# robots.txt 2007.05.03
# Now Googlebot has started to grab images. Try to stop it.
# For whatever reason, google is mainly showing my re-built directory. It
# never seems to go back to the higher level page that now has meta tags
# telling it not to index those pages. Put in a global disallow.
# Add some other global disallows that I got behind on.
#
# robots.txt 2007.03.13
# Stupid yahoo slurp comes all the time now. It supports a non-standard delay command,
# so add the command. The web site doesn't state the units of measure.
#
# robots.txt 2007.02.11
# yahoo slurp seems to now obey the non-standard "ignore this type of file" wildcard usage;
# try it.
#
# robots.txt 2006.12.29
# Delete instructions for directories that don't exist anymore.
#
# robots.txt 2004.12.21
# Try to eliminate yahoo.com grabbing images.
# Can only think of a global deny.
# Cannot find the Yahoo name; try the one shown below.
#
# robots.txt 2004.11.16
# Try to eliminate alexa.com grabbing images.
# InkTomi comes too often; ban it entirely.
#
# robots.txt 2004.07.16
# Try to eliminate picsearch.com grabbing images.
#
# robots.txt 2004.07.09
# Try to eliminate altavista grabbing images.
#

User-agent: FriendlyCrawler
User-agent: ClaudeBot
User-agent: fidget-spinner-bot
User-agent: HubSpot
User-agent: Cincraw
User-agent: TinyTestBot
User-agent: FunnelBack
User-agent: OWLer
User-agent: SenutoBot
User-agent: okhttp
User-agent: AwarioBot
User-agent: MaCoCu
User-agent: woorankreview
User-agent: webprosbot
User-agent: Twitterbot
User-agent: Diffbot
User-agent: Jambot
User-agent: SeekBot
User-agent: SeekportBot
User-agent: Amazonbot
User-agent: vuhuvBot
User-agent: SERPtimizerBot
User-agent: MixrankBot
User-agent: YaK
User-agent: INETDEX-BOT
User-agent: Keybot Translation-Search-Machine
User-agent: FlfBaldrBot
User-agent: DomainStatsBot
User-agent: SurdotlyBot
User-agent: clark-crawler
User-agent: InfoTigerBot
User-agent: infotiger
User-agent: Neevabot
User-agent: Linguee
User-agent: SemanticScholarBot
User-agent: PetalBot
User-agent: niocBot
User-agent: Adsbot
User-agent: CheckMarkNetwork
User-agent: JobboerseBot
User-agent: oBot
User-agent: bbot
User-agent: brands-bot-logo
User-agent: Clarabot
User-agent: serpstatbot
User-agent: Seekport
User-agent: Datanyze
User-agent: Experibot
User-agent: IndeedBot
User-agent: ExtLinksBot
User-agent: crawler4j
User-agent: dataprovider
User-agent: DAUM
User-agent: MauiBot
User-agent: panscient.com
User-agent: vscooter
User-agent: psbot
User-agent: ia_archiver
User-agent: MJ12bot
User-agent: twiceler
User-agent: Yandex
User-agent: taptubot
User-agent: Googlebot-Image
User-agent: TwengaBot
User-agent: sitebot
User-agent: Baiduspider
User-agent: AhrefsBot
User-agent: Ezooms
User-agent: sistrix
User-agent: aiHitBot
User-agent: InfoPath
User-agent: InfoPath.2
User-agent: swebot
User-agent: EC2LinkFinder
User-agent: TurnitinBot
User-agent: The Knowledge AI
User-agent: Mappy
Disallow: /

# Some bots are stupid and
# need their own personal Disallow
# test:
User-agent: Friendly_Crawler
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: IonCrawl
Disallow: /

User-agent: SearchmetricsBot
User-agent: WBSearchBot
User-agent: Exabot
User-agent: Sosospider
User-agent: ip-web-crawler.com
User-agent: netEstate NE Crawler
User-agent: Aboundexbot
User-agent: Aboundex
User-agent: meanpathbot
User-agent: Mail.Ru
User-agent: spbot
User-agent: archive.org_bot
User-agent: LinkpadBot
User-agent: EasouSpider
User-agent: SeznamBot
User-agent: wotbox
User-agent: BLEXBot
User-agent: XoviBot
User-agent: SemrushBot
User-agent: A6-Indexer
User-agent: Riddler
User-agent: LoadTimeBot
User-agent: oBot
User-agent: MojeekBot
User-agent: memoryBot
User-agent: ltx71
Disallow: /

User-agent: AdvBot
User-agent: SMTBot
User-agent: YisouSpider
User-agent: LSSRocketCrawler
User-agent: gsa-crawler
User-agent: Nutch
User-agent: tbot-nutch
User-agent: thunderstone
User-agent: yacybot
User-agent: RankSonicBot
User-agent: betaBot
User-agent: parsijoo-bot
User-agent: NextGenSearchBot
User-agent: gocrawl
User-agent: plukkie
User-agent: Applebot
User-agent: Lipperhey
User-agent: SafeDNSBot
User-agent: RankActiveLinkBot
User-agent: Sogou blog
User-agent: Sogou inst spider
User-agent: Sogou News Spider
User-agent: Sogou Orion spider
User-agent: Sogou spider2
User-agent: Sogou web spider
User-agent: Uptimebot
User-agent: Seeker
User-agent: Cliqzbot
User-agent: DomainCrawler
User-agent: yoozBot
User-agent: 007ac9
User-agent: coccocbot-web
User-agent: Qwantify
User-agent: SiteExplorer
User-agent: Findxbot
User-agent: GarlikCrawler
User-agent: ZoominfoBot
User-agent: BUbiNG
User-agent: Barkrowler
User-agent: rogerbot
User-agent: dotbot
User-agent: JamesBOT
User-agent: Contacts-Crawler
User-agent: CCBot
User-agent: IDBot
User-agent: DnyzBot
User-agent: PiplBot
User-agent: AlphaBot
User-agent: AlphaSeoBot
User-agent: AlphaSeoBot-SA
User-agent: SEOkicks-Robot
Disallow: /

User-agent: msnbot
User-agent: bingbot
Disallow: /

User-agent: Slurp
Disallow: /*.jpg
Disallow: /*.JPG
Disallow: /*.png
Disallow: /*.PDF
Disallow: /*.pdf
Disallow: /*.mp3
Disallow: /*.mp4
Disallow: /*.MOV
Disallow: /*.mov
Disallow: /*.AVI
Disallow: /*.avi
Disallow: /*.svg
Disallow: /*.csv
Disallow: /*.data
Disallow: /disclaimer.html
Disallow: /security.html
Disallow: /poweredby.html
Disallow: /about_smythies.html
Disallow: /unused_link.html
Disallow: /old_pages.html
Disallow: /index_0*
Disallow: /*index_0*$
Disallow: /digital_camera/
Disallow: /lab/
Disallow: /xmas_*
Disallow: /~doug/archives/
Disallow: /~doug/linux/ubuntu-docs/help.ubuntu.com/
Disallow: /~doug/linux/s18/misc/
Disallow: /~doug/linux/s18/hwp/doug/
Disallow: /~doug/linux/s18/hwp/srinivas/
Disallow: /~doug/linux/s18/hwp/k510-rc6/
Disallow: /bot_trap.html

User-agent: Googlebot
Disallow: /*.jpg$
Disallow: /*.JPG$
Disallow: /*.png$
Disallow: /*.PDF$
Disallow: /*.pdf$
Disallow: /*.mp3$
Disallow: /*.mp4$
Disallow: /*.MOV$
Disallow: /*.mov$
Disallow: /*.AVI$
Disallow: /*.avi$
Disallow: /*.csv$
Disallow: /*.svg$
Disallow: /*.data$
Disallow: /index_0*$
Disallow: /*index_0*$
Disallow: /xmas_*
Disallow: /~doug/archives/
Disallow: /~doug/linux/ubuntu-docs/help.ubuntu.com/
Disallow: /~doug/linux/s18/misc/
Disallow: /~doug/linux/s18/hwp/doug/
Disallow: /~doug/linux/s18/hwp/srinivas/
Disallow: /~doug/linux/s18/hwp/k510-rc6/
Disallow: /~doug/2010.01.23/
Disallow: /~doug/2007.11.20/
Disallow: /~doug/2004.06.26/
Disallow: /digital_camera/
Disallow: /old_pages.html
Disallow: /unused_link.html
Disallow: /disclaimer.html
Disallow: /security.html
Disallow: /about_smythies.html
Disallow: /poweredby.html
Disallow: /DSCN*.htm
Disallow: /bot_trap.html

User-agent: *
Disallow: /*.jpg
Disallow: /*.JPG
Disallow: /*.png
Disallow: /*.PDF
Disallow: /*.pdf
Disallow: /*.mp3
Disallow: /*.mp4
Disallow: /*.MOV
Disallow: /*.mov
Disallow: /*.AVI
Disallow: /*.avi
Disallow: /*.svg
Disallow: /*.csv
Disallow: /*.data
Disallow: /disclaimer.html
Disallow: /security.html
Disallow: /poweredby.html
Disallow: /about_smythies.html
Disallow: /unused_link.html
Disallow: /old_pages.html
Disallow: /index_0*
Disallow: /*index_0*$
Disallow: /digital_camera/
Disallow: /lab/
Disallow: /xmas_*
Disallow: /~doug/archives/
Disallow: /~doug/linux/ubuntu-docs/help.ubuntu.com/
Disallow: /~doug/linux/s18/misc/
Disallow: /~doug/linux/s18/hwp/doug/
Disallow: /~doug/linux/s18/hwp/srinivas/
Disallow: /~doug/linux/s18/hwp/k510-rc6/
Disallow: /bot_trap.html
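
# Note on the wildcard patterns above: "*" (match any sequence of
# characters) and "$" (anchor the end of the URL) are extensions to the
# original robots.txt standard, which only did prefix matching. Major
# crawlers such as Googlebot and bingbot support them; older or simpler
# bots may not. For example (illustrative only):
#
#   Disallow: /*.jpg$    matches any URL ending in .jpg
#   Disallow: /index_0*  matches any URL starting with /index_0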