This Blog is just for Educational Purpose. This Blog provide information about python and python frameworks : Django, Selenium, Scrapy etc.
Pages
- Home
- 0. Getting Started with Selenium
- 1. Moving To a Selenium Example
- 2. Clear Concept Using Python Selenium
- 3. Creating Test Cases Using Python and Unittest IN Selenium
- 4. Python Webdriver API's
- 5.Locating Elements On a Web Page
- 6. Selenium Examples For You
- 7. How to Choose Programming Language - Selenium
Python Insider: Python 3.3.3 released
Python Insider: Python 3.3.3 released: A new maintenance release, Python 3.3.3, has been released on November 13, 2013.
Recursive Scraping using different styles with MySQL Database.
Hi,
I don't have much time to write a full Scrapy tutorial for you, but this example may be a helpful hand to Scrapy beginners.
This program recursively scrapes youtube.com; the links to scrape come from a database, and the results are stored back in the database.
here is
Items.py
---------------------------------------------------------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class Project2Item(Item):
    """Item holding the fields scraped for one YouTube search result."""
    # define the fields for your item here like:
    # name = Field()
    videoLink = Field()      # link to the video page (prefixed with the domain)
    videoDuration = Field()  # duration text, e.g. "3:45"
    imageVideo = Field()     # thumbnail image URL
    titleVideo = Field()     # video title
    Description = Field()    # description / metadata snippet
    view = Field()           # view-count text
---------------------------------------------------------------------------------------------------------------------
here is your pipelines.py
-------------------------------------------------------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import re
class Project2Pipeline(object):
    """Store each scraped item into the MySQL table ``scrapyTest``."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; in a real project move
        # them into settings.py and read them from the crawler settings.
        self.conn = MySQLdb.connect(host='localhost',
                                    user='root',
                                    passwd='froot',
                                    db='prime',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item; on a DB error report it, then pass the item on."""
        try:
            # Parameterized query: MySQLdb escapes the values for us.
            # videoLink is already a plain string; title/view are lists, so
            # the first element is taken.
            self.cursor.execute("""insert into scrapyTest (videoLink, videoTitle,videoView,flag)
VALUES (%s, %s, %s,1)""", (item['videoLink'], item['titleVideo'][0], item['view'][0]))
            self.conn.commit()
        except MySQLdb.Error as e:
            # `as e` / print(...) replace the Python-2-only forms so the
            # snippet also parses under Python 3.
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
--------------------------------------------------------------------------------------------------------------------------------
here is your settings.py
======================================================================
# Scrapy settings for project2 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Scrapy project configuration for project2.
BOT_NAME = 'project2'

# Where Scrapy discovers spiders and where `genspider` puts new ones.
SPIDER_MODULES = ['project2.spiders']
NEWSPIDER_MODULE = 'project2.spiders'

# Be polite: at most one concurrent request per domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Route every scraped item through the MySQL pipeline (priority 300).
ITEM_PIPELINES = {'project2.pipelines.Project2Pipeline': 300}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'project2 (+http://www.yourdomain.com)'
-------------------------------------------------------------------------------------------------------------------------------
here is your different spiders.py files:
========================================================================
from scrapy.spider import Spider
from scrapy.selector import Selector
from project2.items import Project2Item
import urlparse
from scrapy.http.request import Request
import MySQLdb
class spider(Spider):
    """Scrape YouTube search-result pages whose URLs are read from MySQL."""
    name = "Youtube"
    allowed_domains = ["youtube.com"]

    # NOTE(review): this query runs at import time (class body), not when the
    # crawl starts -- the page links flagged 2 in scrapyTest become start URLs.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='froot', db='prime')
    cursor = conn.cursor()
    cursor.execute("select pageLink from scrapyTest where flag=2")
    row = [item[0] for item in cursor.fetchall()]
    print(row)
    start_urls = row  # ["https://www.youtube.com/results?search_query=live+music+INdia&page=1"]

    def parse(self, response):
        """Yield one Project2Item per result, then follow the 'next' link."""
        sel = Selector(response)
        next_page = sel.xpath("//div[@role='navigation']/a[@data-link-type='next']/@href").extract()
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            # NOTE(review): Description uses the same XPath as 'view' below --
            # probably meant to target a different element; confirm intent.
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            yield item
        if next_page:  # original used `if not not next_page` -- same truthiness
            yield Request("http://www.youtube.com" + next_page[0], self.parse)
---------------------------------------------------------------------------------------------------------------------------------
second method:
========================================================================
from scrapy.selector import Selector
from scrapy.spider import Spider
from project2.items import Project2Item
from scrapy.http.request import Request
class youtubespider(Spider):
    """Scrape 50 fixed YouTube search-result pages (live music concerts)."""
    name = "youtube2"
    # BUGFIX: allowed_domains must be a LIST of domains, not a bare string --
    # Scrapy's offsite middleware iterates it, and iterating a string yields
    # single characters.
    allowed_domains = ["www.youtube.com"]

    # Pre-build the start URLs at class-definition time
    # (note: page numbering here starts at 0, matching the original).
    start_urls = [
        'https://www.youtube.com/results?search_query=live+music+concert+india&page=%s' % page
        for page in range(50)
    ]

    def parse(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
=======================================================================
third method:
----------------------------------------------------------------------------------------------------------------------------------
from scrapy.selector import Selector
from project2.items import Project2Item
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class youtubespider(CrawlSpider):
    """Crawl paginated YouTube search results for 'bollywood movies'."""
    name = "bollywoodMovies"
    allowed_domains = ["www.youtube.com"]
    start_urls = ['https://www.youtube.com/results?search_query=bollywood+movies&page=1']
    # Follow every link whose URL contains "page=<digits>" and parse it.
    # FIX: raw string for the regex so `\d` is not an invalid escape.
    rules = (Rule(SgmlLinkExtractor(allow=(r"page=\d+",)), callback="parseitem", follow=True),)

    def parseitem(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[2]/span/@data-original-html").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
========================================================================
fourth style: Try it yourself
I hope this is helpful for you.
----------------------------------------------------------------------------------------------------------------------------------
I don't have much time to write a full Scrapy tutorial for you, but this example may be a helpful hand to Scrapy beginners.
This program recursively scrapes youtube.com; the links to scrape come from a database, and the results are stored back in the database.
here is
Items.py
---------------------------------------------------------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field
class Project2Item(Item):
    """Item holding the fields scraped for one YouTube search result."""
    # define the fields for your item here like:
    # name = Field()
    videoLink = Field()      # link to the video page (prefixed with the domain)
    videoDuration = Field()  # duration text, e.g. "3:45"
    imageVideo = Field()     # thumbnail image URL
    titleVideo = Field()     # video title
    Description = Field()    # description / metadata snippet
    view = Field()           # view-count text
---------------------------------------------------------------------------------------------------------------------
here is your pipelines.py
-------------------------------------------------------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb
import re
class Project2Pipeline(object):
    """Store each scraped item into the MySQL table ``scrapyTest``."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; in a real project move
        # them into settings.py and read them from the crawler settings.
        self.conn = MySQLdb.connect(host='localhost',
                                    user='root',
                                    passwd='froot',
                                    db='prime',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert the item; on a DB error report it, then pass the item on."""
        try:
            # Parameterized query: MySQLdb escapes the values for us.
            # videoLink is already a plain string; title/view are lists, so
            # the first element is taken.
            self.cursor.execute("""insert into scrapyTest (videoLink, videoTitle,videoView,flag)
VALUES (%s, %s, %s,1)""", (item['videoLink'], item['titleVideo'][0], item['view'][0]))
            self.conn.commit()
        except MySQLdb.Error as e:
            # `as e` / print(...) replace the Python-2-only forms so the
            # snippet also parses under Python 3.
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
--------------------------------------------------------------------------------------------------------------------------------
here is your settings.py
======================================================================
# Scrapy settings for project2 project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Scrapy project configuration for project2.
BOT_NAME = 'project2'

# Where Scrapy discovers spiders and where `genspider` puts new ones.
SPIDER_MODULES = ['project2.spiders']
NEWSPIDER_MODULE = 'project2.spiders'

# Be polite: at most one concurrent request per domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Route every scraped item through the MySQL pipeline (priority 300).
ITEM_PIPELINES = {'project2.pipelines.Project2Pipeline': 300}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'project2 (+http://www.yourdomain.com)'
-------------------------------------------------------------------------------------------------------------------------------
here is your different spiders.py files:
========================================================================
from scrapy.spider import Spider
from scrapy.selector import Selector
from project2.items import Project2Item
import urlparse
from scrapy.http.request import Request
import MySQLdb
class spider(Spider):
    """Scrape YouTube search-result pages whose URLs are read from MySQL."""
    name = "Youtube"
    allowed_domains = ["youtube.com"]

    # NOTE(review): this query runs at import time (class body), not when the
    # crawl starts -- the page links flagged 2 in scrapyTest become start URLs.
    conn = MySQLdb.connect(host='localhost', user='root', passwd='froot', db='prime')
    cursor = conn.cursor()
    cursor.execute("select pageLink from scrapyTest where flag=2")
    row = [item[0] for item in cursor.fetchall()]
    print(row)
    start_urls = row  # ["https://www.youtube.com/results?search_query=live+music+INdia&page=1"]

    def parse(self, response):
        """Yield one Project2Item per result, then follow the 'next' link."""
        sel = Selector(response)
        next_page = sel.xpath("//div[@role='navigation']/a[@data-link-type='next']/@href").extract()
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            # NOTE(review): Description uses the same XPath as 'view' below --
            # probably meant to target a different element; confirm intent.
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            yield item
        if next_page:  # original used `if not not next_page` -- same truthiness
            yield Request("http://www.youtube.com" + next_page[0], self.parse)
---------------------------------------------------------------------------------------------------------------------------------
second method:
========================================================================
from scrapy.selector import Selector
from scrapy.spider import Spider
from project2.items import Project2Item
from scrapy.http.request import Request
class youtubespider(Spider):
    """Scrape 50 fixed YouTube search-result pages (live music concerts)."""
    name = "youtube2"
    # BUGFIX: allowed_domains must be a LIST of domains, not a bare string --
    # Scrapy's offsite middleware iterates it, and iterating a string yields
    # single characters.
    allowed_domains = ["www.youtube.com"]

    # Pre-build the start URLs at class-definition time
    # (note: page numbering here starts at 0, matching the original).
    start_urls = [
        'https://www.youtube.com/results?search_query=live+music+concert+india&page=%s' % page
        for page in range(50)
    ]

    def parse(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
=======================================================================
third method:
----------------------------------------------------------------------------------------------------------------------------------
from scrapy.selector import Selector
from project2.items import Project2Item
from scrapy.http.request import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class youtubespider(CrawlSpider):
    """Crawl paginated YouTube search results for 'bollywood movies'."""
    name = "bollywoodMovies"
    allowed_domains = ["www.youtube.com"]
    start_urls = ['https://www.youtube.com/results?search_query=bollywood+movies&page=1']
    # Follow every link whose URL contains "page=<digits>" and parse it.
    # FIX: raw string for the regex so `\d` is not an invalid escape.
    rules = (Rule(SgmlLinkExtractor(allow=(r"page=\d+",)), callback="parseitem", follow=True),)

    def parseitem(self, response):
        """Return a list with one Project2Item per search result on the page."""
        sel = Selector(response)
        items = []
        for listi in sel.xpath("//ol[@id='search-results']/li"):
            item = Project2Item()
            item['videoDuration'] = listi.xpath("div[1]/a/span[@class='video-time']/text()").extract()
            item['videoLink'] = "www.youtube.com" + listi.xpath("div[1]/a/@href").extract()[0]
            item['imageVideo'] = listi.xpath('div[1]/a/button//span/img/@src').extract()
            item['titleVideo'] = listi.xpath('div[2]/h3/a/@title').extract()
            item['Description'] = listi.xpath("div[2]/div[2]/span/@data-original-html").extract()
            item['view'] = listi.xpath("div[2]/div[@class='yt-lockup-meta']/ul/li[3]/text()").extract()
            items.append(item)
        return items
========================================================================
fourth style: Try it yourself
I hope this is helpful for you.
----------------------------------------------------------------------------------------------------------------------------------
Selenium Example for iphone and android
from selenium import webdriver

# Pick ONE of the drivers below for the platform you are testing.
# iPhone (remote WebDriver hub)
driver = webdriver.Remote(browser_name="iphone", command_executor='http://172.24.101.36:3001/hub')
# Android (remote WebDriver hub)
driver = webdriver.Remote(browser_name="android", command_executor='http://127.0.0.1:8080/hub')
# Google Chrome
driver = webdriver.Chrome()
# Firefox
driver = webdriver.Firefox()

# ------------------------------
# The actual test scenario: test the codepad.org code execution service.

# Go to codepad.org
driver.get('http://codepad.org')

# Select the Python language option
python_link = driver.find_elements_by_xpath("//input[@name='lang' and @value='Python']")[0]
python_link.click()

# Enter some text!
text_area = driver.find_element_by_id('textarea')
text_area.send_keys("print 'Hello,' + ' World!'")

# Submit the form!
submit_button = driver.find_element_by_name('submit')
submit_button.click()

# Make this an actual test. Isn't Python beautiful?
assert "Hello, World!" in driver.get_page_source()

# Close the browser!
driver.quit()
Subscribe to:
Posts (Atom)